Moved test resources to maven's test resources directory
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/3f1cf76f Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/3f1cf76f Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/3f1cf76f Branch: refs/heads/NUTCH-2292 Commit: 3f1cf76fb87ecd8f387c4214430f7d67e33a8980 Parents: 20d2840 Author: Thamme Gowda <[email protected]> Authored: Tue Jul 5 15:37:22 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Tue Jul 5 15:37:22 2016 -0700 ---------------------------------------------------------------------- nutch-plugins/creativecommons/data/anchor.html | 9 - nutch-plugins/creativecommons/data/rdf.html | 35 -- nutch-plugins/creativecommons/data/rel.html | 6 - .../src/test/resources/anchor.html | 9 + .../creativecommons/src/test/resources/rdf.html | 35 ++ .../creativecommons/src/test/resources/rel.html | 6 + nutch-plugins/feed/sample/rsstest.rss | 36 -- .../feed/src/test/resources/rsstest.rss | 36 ++ .../index-replace/sample/testIndexReplace.html | 12 - .../src/test/resources/testIndexReplace.html | 12 + .../mimetype-filter/sample/allow-images.txt | 34 -- .../mimetype-filter/sample/block-html.txt | 34 -- .../src/test/resources/allow-images.txt | 34 ++ .../src/test/resources/block-html.txt | 34 ++ .../parse-metatags/sample/testMetatags.html | 9 - .../sample/testMultivalueMetatags.html | 12 - .../src/test/resources/testMetatags.html | 9 + .../test/resources/testMultivalueMetatags.html | 12 + .../parse-replace/sample/testParseReplace.html | 11 - .../src/test/resources/testParseReplace.html | 11 + nutch-plugins/parse-swf/sample/test1.swf | Bin 21054 -> 0 bytes nutch-plugins/parse-swf/sample/test1.txt | 60 --- nutch-plugins/parse-swf/sample/test2.swf | Bin 42534 -> 0 bytes nutch-plugins/parse-swf/sample/test2.txt | 5 - nutch-plugins/parse-swf/sample/test3.swf | Bin 51562 -> 0 bytes nutch-plugins/parse-swf/sample/test3.txt | 11 - .../parse-swf/src/test/resources/test1.swf | Bin 0 -> 21054 bytes .../parse-swf/src/test/resources/test1.txt | 60 +++ .../parse-swf/src/test/resources/test2.swf | Bin 0 -> 42534 bytes .../parse-swf/src/test/resources/test2.txt | 5 + .../parse-swf/src/test/resources/test3.swf | Bin 0 -> 51562 bytes .../parse-swf/src/test/resources/test3.txt | 11 + nutch-plugins/parse-tika/sample/encrypted.pdf | Bin 3431 -> 0 bytes nutch-plugins/parse-tika/sample/nutch.html | 519 ------------------- .../parse-tika/sample/nutch_logo_tm.gif | Bin 2747 -> 0 bytes nutch-plugins/parse-tika/sample/ootest.odt | Bin 20753 -> 0 bytes nutch-plugins/parse-tika/sample/ootest.sxw | Bin 20125 -> 0 bytes nutch-plugins/parse-tika/sample/ootest.txt | 30 -- nutch-plugins/parse-tika/sample/pdftest.pdf | 157 ------ nutch-plugins/parse-tika/sample/rsstest.rss | 37 -- nutch-plugins/parse-tika/sample/test.rtf | 17 - nutch-plugins/parse-tika/sample/word97.doc | Bin 8192 -> 0 bytes .../parse-tika/src/test/resources/encrypted.pdf | Bin 0 -> 3431 bytes .../parse-tika/src/test/resources/nutch.html | 519 +++++++++++++++++++ .../src/test/resources/nutch_logo_tm.gif | Bin 0 -> 2747 bytes .../parse-tika/src/test/resources/ootest.odt | Bin 0 -> 20753 bytes .../parse-tika/src/test/resources/ootest.sxw | Bin 0 -> 20125 bytes .../parse-tika/src/test/resources/ootest.txt | 30 ++ .../parse-tika/src/test/resources/pdftest.pdf | 157 ++++++ .../parse-tika/src/test/resources/rsstest.rss | 37 ++ .../parse-tika/src/test/resources/test.rtf | 17 + .../parse-tika/src/test/resources/word97.doc | Bin 0 -> 8192 bytes nutch-plugins/parse-zip/sample/test.zip | Bin 182 -> 0 bytes .../parse-zip/src/test/resources/test.zip | Bin 0 -> 182 bytes .../data/regex-parsefilter.txt | 10 - .../src/test/resources/regex-parsefilter.txt | 10 + .../protocol-file/sample/testprotocolfile.txt | 1 - .../sample/testprotocolfile_(encoded).txt | 1 - .../src/test/resources/testprotocolfile.txt | 1 + .../resources/testprotocolfile_(encoded).txt | 1 + .../urlfilter-automaton/sample/Benchmarks.rules | 26 - .../urlfilter-automaton/sample/Benchmarks.urls | 297 ----------- .../sample/IntranetCrawling.rules | 24 - .../sample/IntranetCrawling.urls | 8 - .../sample/WholeWebCrawling.rules | 19 - .../sample/WholeWebCrawling.urls | 11 - .../src/test/resources/Benchmarks.rules | 26 + .../src/test/resources/Benchmarks.urls | 297 +++++++++++ .../src/test/resources/IntranetCrawling.rules | 24 + .../src/test/resources/IntranetCrawling.urls | 8 + .../src/test/resources/WholeWebCrawling.rules | 19 + .../src/test/resources/WholeWebCrawling.urls | 11 + nutch-plugins/urlfilter-domain/data/hosts.txt | 5 - .../src/test/resources/hosts.txt | 5 + .../urlfilter-domainblacklist/data/hosts.txt | 5 - .../src/test/resources/hosts.txt | 5 + .../urlfilter-ignoreexempt/data/.donotdelete | 0 .../urlfilter-regex/sample/Benchmarks.rules | 26 - .../urlfilter-regex/sample/Benchmarks.urls | 297 ----------- .../sample/IntranetCrawling.rules | 27 - .../sample/IntranetCrawling.urls | 8 - .../sample/WholeWebCrawling.rules | 22 - .../sample/WholeWebCrawling.urls | 11 - .../urlfilter-regex/sample/nutch1838.rules | 12 - .../urlfilter-regex/sample/nutch1838.urls | 3 - .../src/test/resources/Benchmarks.rules | 26 + .../src/test/resources/Benchmarks.urls | 297 +++++++++++ .../src/test/resources/IntranetCrawling.rules | 27 + .../src/test/resources/IntranetCrawling.urls | 8 + .../src/test/resources/WholeWebCrawling.rules | 22 + .../src/test/resources/WholeWebCrawling.urls | 11 + .../src/test/resources/nutch1838.rules | 12 + .../src/test/resources/nutch1838.urls | 3 + nutch-plugins/urlnormalizer-host/data/hosts.txt | 8 - .../src/test/resources/hosts.txt | 8 + .../urlnormalizer-protocol/data/protocols.txt | 7 - .../src/test/resources/protocols.txt | 7 + .../sample/regex-normalize-default.test | 84 --- .../sample/regex-normalize-default.xml | 66 --- .../sample/regex-normalize-scope1.test | 8 - .../sample/regex-normalize-scope1.xml | 21 - .../test/resources/regex-normalize-default.test | 84 +++ .../test/resources/regex-normalize-default.xml | 66 +++ .../test/resources/regex-normalize-scope1.test | 8 + .../test/resources/regex-normalize-scope1.xml | 21 + .../urlnormalizer-slash/data/slashes.txt | 7 - .../src/test/resources/slashes.txt | 7 + 107 files changed, 2048 insertions(+), 2048 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/creativecommons/data/anchor.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/data/anchor.html b/nutch-plugins/creativecommons/data/anchor.html deleted file mode 100755 index 90b5227..0000000 --- a/nutch-plugins/creativecommons/data/anchor.html +++ /dev/null @@ -1,9 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd"> -<html> -<head> -</head> -<body> -<p><a href="http://creativecommons.org/licenses/by-nc-sa/1.0"><img alt="Creative Commons License" src="http://creativecommons.org/images/public/somerights.gif" align="right"></a>This file is licensed under a -<a href="http://creativecommons.org/licenses/by-nc-sa/1.0">Creative Commons License</a>.</p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/creativecommons/data/rdf.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/data/rdf.html b/nutch-plugins/creativecommons/data/rdf.html deleted file mode 100755 index fb2c34d..0000000 --- a/nutch-plugins/creativecommons/data/rdf.html +++ /dev/null @@ -1,35 +0,0 @@ -<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> -<html> - <head> - </head> - <body> - -<!-- Creative Commons License --> -<p><a href="http://creativecommons.org/licenses/by-nc/1.0"><img alt="Creative Commons License" border="0" src="http://creativecommons.org/images/public/somerights.gif" /></a><br /> -This work is licensed under a -<a href="http://creativecommons.org/licenses/by-nc/1.0">Creative Commons License</a>. -<!-- end Creative Commons License --> - - <!-- -<rdf:RDF xmlns="http://web.resource.org/cc/" - xmlns:dc="http://purl.org/dc/elements/1.1/" - xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> -<Work rdf:about="http://boingboing.net"> - <dc:type rdf:resource="http://purl.org/dc/dcmitype/Text" /> - <license rdf:resource="http://creativecommons.org/licenses/by-nc/1.0" /> -</Work> - -<License rdf:about="http://creativecommons.org/licenses/by-nc/1.0"> - <requires rdf:resource="http://web.resource.org/cc/Attribution" /> - <permits rdf:resource="http://web.resource.org/cc/DerivativeWorks" /> - <permits rdf:resource="http://web.resource.org/cc/Reproduction" /> - <permits rdf:resource="http://web.resource.org/cc/Distribution" /> - <prohibits rdf:resource="http://web.resource.org/cc/CommercialUse" /> - <requires rdf:resource="http://web.resource.org/cc/Notice" /> -</License> - -</rdf:RDF> - ---> - </body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/creativecommons/data/rel.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/data/rel.html b/nutch-plugins/creativecommons/data/rel.html deleted file mode 100755 index 413d52f..0000000 --- a/nutch-plugins/creativecommons/data/rel.html +++ /dev/null @@ -1,6 +0,0 @@ -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head> -</head><body> -<a rel="license" href="http://creativecommons.org/licenses/by-nc/2.0">CC by-nc</a> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/creativecommons/src/test/resources/anchor.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/test/resources/anchor.html b/nutch-plugins/creativecommons/src/test/resources/anchor.html new file mode 100755 index 0000000..90b5227 --- /dev/null +++ b/nutch-plugins/creativecommons/src/test/resources/anchor.html @@ -0,0 +1,9 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd"> +<html> +<head> +</head> +<body> +<p><a href="http://creativecommons.org/licenses/by-nc-sa/1.0"><img alt="Creative Commons License" src="http://creativecommons.org/images/public/somerights.gif" align="right"></a>This file is licensed under a +<a href="http://creativecommons.org/licenses/by-nc-sa/1.0">Creative Commons License</a>.</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/creativecommons/src/test/resources/rdf.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/test/resources/rdf.html b/nutch-plugins/creativecommons/src/test/resources/rdf.html new file mode 100755 index 0000000..fb2c34d --- /dev/null +++ b/nutch-plugins/creativecommons/src/test/resources/rdf.html @@ -0,0 +1,35 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> + <head> + </head> + <body> + +<!-- Creative Commons License --> +<p><a href="http://creativecommons.org/licenses/by-nc/1.0"><img alt="Creative Commons License" border="0" src="http://creativecommons.org/images/public/somerights.gif" /></a><br /> +This work is licensed under a +<a href="http://creativecommons.org/licenses/by-nc/1.0">Creative Commons License</a>. +<!-- end Creative Commons License --> + + <!-- +<rdf:RDF xmlns="http://web.resource.org/cc/" + xmlns:dc="http://purl.org/dc/elements/1.1/" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> +<Work rdf:about="http://boingboing.net"> + <dc:type rdf:resource="http://purl.org/dc/dcmitype/Text" /> + <license rdf:resource="http://creativecommons.org/licenses/by-nc/1.0" /> +</Work> + +<License rdf:about="http://creativecommons.org/licenses/by-nc/1.0"> + <requires rdf:resource="http://web.resource.org/cc/Attribution" /> + <permits rdf:resource="http://web.resource.org/cc/DerivativeWorks" /> + <permits rdf:resource="http://web.resource.org/cc/Reproduction" /> + <permits rdf:resource="http://web.resource.org/cc/Distribution" /> + <prohibits rdf:resource="http://web.resource.org/cc/CommercialUse" /> + <requires rdf:resource="http://web.resource.org/cc/Notice" /> +</License> + +</rdf:RDF> + +--> + </body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/creativecommons/src/test/resources/rel.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/creativecommons/src/test/resources/rel.html b/nutch-plugins/creativecommons/src/test/resources/rel.html new file mode 100755 index 0000000..413d52f --- /dev/null +++ b/nutch-plugins/creativecommons/src/test/resources/rel.html @@ -0,0 +1,6 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head> +</head><body> +<a rel="license" href="http://creativecommons.org/licenses/by-nc/2.0">CC by-nc</a> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/feed/sample/rsstest.rss ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/sample/rsstest.rss b/nutch-plugins/feed/sample/rsstest.rss deleted file mode 100644 index 758f6a1..0000000 --- a/nutch-plugins/feed/sample/rsstest.rss +++ /dev/null @@ -1,36 +0,0 @@ -<?xml version="1.0" encoding="ISO-8859-1" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<rss version="0.91"> - <channel> - <title>TestChannel</title> - <link>http://test.channel.com/</link> - <description>Sample RSS File for Junit test</description> - <language>en-us</language> - - <item> - <title>Home Page of Chris Mattmann</title> - <link>http://www-scf.usc.edu/~mattmann/</link> - <description>Chris Mattmann's home page</description> - </item> - <item> - <title>Awesome Open Source Search Engine</title> - <link>http://www.nutch.org/</link> - <description>Yup, that's what it is</description> - </item> - </channel> -</rss> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/feed/src/test/resources/rsstest.rss ---------------------------------------------------------------------- diff --git a/nutch-plugins/feed/src/test/resources/rsstest.rss b/nutch-plugins/feed/src/test/resources/rsstest.rss new file mode 100644 index 0000000..758f6a1 --- /dev/null +++ b/nutch-plugins/feed/src/test/resources/rsstest.rss @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="ISO-8859-1" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<rss version="0.91"> + <channel> + <title>TestChannel</title> + <link>http://test.channel.com/</link> + <description>Sample RSS File for Junit test</description> + <language>en-us</language> + + <item> + <title>Home Page of Chris Mattmann</title> + <link>http://www-scf.usc.edu/~mattmann/</link> + <description>Chris Mattmann's home page</description> + </item> + <item> + <title>Awesome Open Source Search Engine</title> + <link>http://www.nutch.org/</link> + <description>Yup, that's what it is</description> + </item> + </channel> +</rss> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/index-replace/sample/testIndexReplace.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/sample/testIndexReplace.html b/nutch-plugins/index-replace/sample/testIndexReplace.html deleted file mode 100644 index 0b90fc2..0000000 --- a/nutch-plugins/index-replace/sample/testIndexReplace.html +++ /dev/null @@ -1,12 +0,0 @@ -<html> - <head> - <title>Testing the power of the index-replace plugin</title> - <meta name="description" content="With this plugin, I control the description! Bwuhuhuhaha!"> - <meta name="keywords" content="Breathtaking, Riveting, Two Thumbs Up!"> - <meta name="author" content="Peter Ciuffetti"> - </head> - <body> - <p>This html file is used to test the Nutch index-replace regexp replacer plugin. - A decidedly boring thing to do.</p> - </body> -</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html b/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html new file mode 100644 index 0000000..0b90fc2 --- /dev/null +++ b/nutch-plugins/index-replace/src/test/resources/testIndexReplace.html @@ -0,0 +1,12 @@ +<html> + <head> + <title>Testing the power of the index-replace plugin</title> + <meta name="description" content="With this plugin, I control the description! Bwuhuhuhaha!"> + <meta name="keywords" content="Breathtaking, Riveting, Two Thumbs Up!"> + <meta name="author" content="Peter Ciuffetti"> + </head> + <body> + <p>This html file is used to test the Nutch index-replace regexp replacer plugin. + A decidedly boring thing to do.</p> + </body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/mimetype-filter/sample/allow-images.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/sample/allow-images.txt b/nutch-plugins/mimetype-filter/sample/allow-images.txt deleted file mode 100644 index 0f5f136..0000000 --- a/nutch-plugins/mimetype-filter/sample/allow-images.txt +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This filter can be configured to work in one of two modes (similar to -# suffix-url-filter) - -# default to reject ('-'): in this mode, only documents with a mimetype that -# match the ones specified in the config file will be accepted, all other -# mimetypes will be rejected. - -# default to accept ('+'): in this mode, only documents with a mimetype -# that match the ones specified in the config file will be rejected, -# all other mimetypes will be accepted. - -# The format of this config file is one mimetype per line, with no preceding -# whitespace. Order, in which suffixes are specified, doesn't matter. Blank -# lines and comments (#) are allowed. -# - -- - -image http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/mimetype-filter/sample/block-html.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/sample/block-html.txt b/nutch-plugins/mimetype-filter/sample/block-html.txt deleted file mode 100644 index 69600ec..0000000 --- a/nutch-plugins/mimetype-filter/sample/block-html.txt +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This filter can be configured to work in one of two modes (similar to -# suffix-url-filter) - -# default to reject ('-'): in this mode, only documents with a mimetype that -# match the ones specified in the config file will be accepted, all other -# mimetypes will be rejected. - -# default to accept ('+'): in this mode, only documents with a mimetype -# that match the ones specified in the config file will be rejected, -# all other mimetypes will be accepted. - -# The format of this config file is one mimetype per line, with no preceding -# whitespace. Order, in which suffixes are specified, doesn't matter. Blank -# lines and comments (#) are allowed. -# - -+ - -text/html \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt b/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt new file mode 100644 index 0000000..0f5f136 --- /dev/null +++ b/nutch-plugins/mimetype-filter/src/test/resources/allow-images.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This filter can be configured to work in one of two modes (similar to +# suffix-url-filter) + +# default to reject ('-'): in this mode, only documents with a mimetype that +# match the ones specified in the config file will be accepted, all other +# mimetypes will be rejected. + +# default to accept ('+'): in this mode, only documents with a mimetype +# that match the ones specified in the config file will be rejected, +# all other mimetypes will be accepted. + +# The format of this config file is one mimetype per line, with no preceding +# whitespace. Order, in which suffixes are specified, doesn't matter. Blank +# lines and comments (#) are allowed. +# + +- + +image http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt b/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt new file mode 100644 index 0000000..69600ec --- /dev/null +++ b/nutch-plugins/mimetype-filter/src/test/resources/block-html.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This filter can be configured to work in one of two modes (similar to +# suffix-url-filter) + +# default to reject ('-'): in this mode, only documents with a mimetype that +# match the ones specified in the config file will be accepted, all other +# mimetypes will be rejected. + +# default to accept ('+'): in this mode, only documents with a mimetype +# that match the ones specified in the config file will be rejected, +# all other mimetypes will be accepted. + +# The format of this config file is one mimetype per line, with no preceding +# whitespace. Order, in which suffixes are specified, doesn't matter. Blank +# lines and comments (#) are allowed. +# + ++ + +text/html \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-metatags/sample/testMetatags.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/sample/testMetatags.html b/nutch-plugins/parse-metatags/sample/testMetatags.html deleted file mode 100644 index e9e8e6b..0000000 --- a/nutch-plugins/parse-metatags/sample/testMetatags.html +++ /dev/null @@ -1,9 +0,0 @@ -<html> -<head> -<meta name="Keywords" content="This is a test of keywords" /> -<meta name="Description" content="This is a test of description" /> -</head> -<body> -text of the document -</body> - http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html b/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html deleted file mode 100644 index ca8b737..0000000 --- a/nutch-plugins/parse-metatags/sample/testMultivalueMetatags.html +++ /dev/null @@ -1,12 +0,0 @@ -<html> -<head> -<meta name="DC.creator" content="Doug Cutting"> -<meta name="DC.creator" content="Michael Cafarella"> -<!-- meta keywords in different casing --> -<meta name="keywords" lang="en" content="web crawler" /> -<meta name="Keywords" lang="fr" content="robot d'indexation" /> -<meta name="KEYWORDS" lang="de" content="Webcrawler" /> -</head> -<body> -A test for multi-valued metatags. -</body> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-metatags/src/test/resources/testMetatags.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/src/test/resources/testMetatags.html b/nutch-plugins/parse-metatags/src/test/resources/testMetatags.html new file mode 100644 index 0000000..e9e8e6b --- /dev/null +++ b/nutch-plugins/parse-metatags/src/test/resources/testMetatags.html @@ -0,0 +1,9 @@ +<html> +<head> +<meta name="Keywords" content="This is a test of keywords" /> +<meta name="Description" content="This is a test of description" /> +</head> +<body> +text of the document +</body> + http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-metatags/src/test/resources/testMultivalueMetatags.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-metatags/src/test/resources/testMultivalueMetatags.html b/nutch-plugins/parse-metatags/src/test/resources/testMultivalueMetatags.html new file mode 100644 index 0000000..ca8b737 --- /dev/null +++ b/nutch-plugins/parse-metatags/src/test/resources/testMultivalueMetatags.html @@ -0,0 +1,12 @@ +<html> +<head> +<meta name="DC.creator" content="Doug Cutting"> +<meta name="DC.creator" content="Michael Cafarella"> +<!-- meta keywords in different casing --> +<meta name="keywords" lang="en" content="web crawler" /> +<meta name="Keywords" lang="fr" content="robot d'indexation" /> +<meta name="KEYWORDS" lang="de" content="Webcrawler" /> +</head> +<body> +A test for multi-valued metatags. +</body> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-replace/sample/testParseReplace.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/sample/testParseReplace.html b/nutch-plugins/parse-replace/sample/testParseReplace.html deleted file mode 100644 index 825dcb9..0000000 --- a/nutch-plugins/parse-replace/sample/testParseReplace.html +++ /dev/null @@ -1,11 +0,0 @@ -<html> - <head> - <title>Testing the power of parser-replace plugin</title> - <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!"> - <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!"> - <meta name="author" content="Peter Ciuffetti"> - </head> - <body> - <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. A decidely boring thing to do.</p> - </body> -</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-replace/src/test/resources/testParseReplace.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-replace/src/test/resources/testParseReplace.html b/nutch-plugins/parse-replace/src/test/resources/testParseReplace.html new file mode 100644 index 0000000..825dcb9 --- /dev/null +++ b/nutch-plugins/parse-replace/src/test/resources/testParseReplace.html @@ -0,0 +1,11 @@ +<html> + <head> + <title>Testing the power of parser-replace plugin</title> + <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!"> + <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!"> + <meta name="author" content="Peter Ciuffetti"> + </head> + <body> + <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. A decidely boring thing to do.</p> + </body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/sample/test1.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/sample/test1.swf b/nutch-plugins/parse-swf/sample/test1.swf deleted file mode 100644 index cd2019b..0000000 Binary files a/nutch-plugins/parse-swf/sample/test1.swf and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/sample/test1.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/sample/test1.txt b/nutch-plugins/parse-swf/sample/test1.txt deleted file mode 100644 index 68505d5..0000000 --- a/nutch-plugins/parse-swf/sample/test1.txt +++ /dev/null @@ -1,60 +0,0 @@ - --------- -/go/gnav_cart -/go/gnav_company -/go/gnav_devnet -/go/gnav_downloads -/go/gnav_fl_minmessage -/go/gnav_help -/go/gnav_mm_home -/go/gnav_products -/go/gnav_search?loc=en_us -/go/gnav_showcase -/go/gnav_solutions -/go/gnav_store -/go/gnav_support -/go/gnav_your_account -Acquisition Info -Adobe Home -AppleGothic -Array -Company -Developers -Downloads -Help -Home -International -LocaleManager -Macromedia Flash Player -Macromedia Home -MovieClip -Products -Showcase -Solutions -Store -String -Support -TextFormat -To ensure the best possible Internet Experience, please download the latest version of the free -Verdana -_sans -active -bluePill -button -color -company -devnet -downloads -en_us -home -javascript:openCrosslinkWindow('/go/adobeacquisition') -javascript:openCrosslinkWindow('/go/gnav_adobe_home') -products -rollOut -rollOver -selected -showcase -solutions -support -tabHolder -textColor http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/sample/test2.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/sample/test2.swf b/nutch-plugins/parse-swf/sample/test2.swf deleted file mode 100644 index eb9b03d..0000000 Binary files a/nutch-plugins/parse-swf/sample/test2.swf and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/sample/test2.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/sample/test2.txt b/nutch-plugins/parse-swf/sample/test2.txt deleted file mode 100644 index f77b78a..0000000 --- a/nutch-plugins/parse-swf/sample/test2.txt +++ /dev/null @@ -1,5 +0,0 @@ -Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini --------- -TextFormat -color -font http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/sample/test3.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/sample/test3.swf b/nutch-plugins/parse-swf/sample/test3.swf deleted file mode 100644 index 4df9f1e..0000000 Binary files a/nutch-plugins/parse-swf/sample/test3.swf and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/sample/test3.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/sample/test3.txt b/nutch-plugins/parse-swf/sample/test3.txt deleted file mode 100644 index 66ae3d8..0000000 --- a/nutch-plugins/parse-swf/sample/test3.txt +++ /dev/null @@ -1,11 +0,0 @@ -Mix. - Edit. - Master. - Compose. - Animate. - With a single suite of powerful tools - that work together as one. - World-class video and audio tools that bring - new power and efficiency to your film, video, - DVD, and web workflows. - Learn more. http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/src/test/resources/test1.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.swf b/nutch-plugins/parse-swf/src/test/resources/test1.swf new file mode 100644 index 0000000..cd2019b Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test1.swf differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/src/test/resources/test1.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.txt b/nutch-plugins/parse-swf/src/test/resources/test1.txt new file mode 100644 index 0000000..68505d5 --- /dev/null +++ b/nutch-plugins/parse-swf/src/test/resources/test1.txt @@ -0,0 +1,60 @@ + +-------- +/go/gnav_cart +/go/gnav_company +/go/gnav_devnet +/go/gnav_downloads +/go/gnav_fl_minmessage +/go/gnav_help +/go/gnav_mm_home +/go/gnav_products +/go/gnav_search?loc=en_us +/go/gnav_showcase +/go/gnav_solutions +/go/gnav_store +/go/gnav_support +/go/gnav_your_account +Acquisition Info +Adobe Home +AppleGothic +Array +Company +Developers +Downloads +Help +Home +International +LocaleManager +Macromedia Flash Player +Macromedia Home +MovieClip +Products +Showcase +Solutions +Store +String +Support +TextFormat +To ensure the best possible Internet Experience, please download the latest version of the free +Verdana +_sans +active +bluePill +button +color +company +devnet +downloads +en_us +home +javascript:openCrosslinkWindow('/go/adobeacquisition') +javascript:openCrosslinkWindow('/go/gnav_adobe_home') +products +rollOut +rollOver +selected +showcase +solutions +support +tabHolder +textColor http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/src/test/resources/test2.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.swf b/nutch-plugins/parse-swf/src/test/resources/test2.swf new file mode 100644 index 0000000..eb9b03d Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test2.swf differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/src/test/resources/test2.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.txt b/nutch-plugins/parse-swf/src/test/resources/test2.txt new file mode 100644 index 0000000..f77b78a --- /dev/null +++ b/nutch-plugins/parse-swf/src/test/resources/test2.txt @@ -0,0 +1,5 @@ +Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini +-------- +TextFormat +color +font http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/src/test/resources/test3.swf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.swf b/nutch-plugins/parse-swf/src/test/resources/test3.swf new file mode 100644 index 0000000..4df9f1e Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test3.swf differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-swf/src/test/resources/test3.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.txt b/nutch-plugins/parse-swf/src/test/resources/test3.txt new file mode 100644 index 0000000..66ae3d8 --- /dev/null +++ b/nutch-plugins/parse-swf/src/test/resources/test3.txt @@ -0,0 +1,11 @@ +Mix. + Edit. + Master. + Compose. + Animate. + With a single suite of powerful tools + that work together as one. + World-class video and audio tools that bring + new power and efficiency to your film, video, + DVD, and web workflows. + Learn more. http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/encrypted.pdf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/encrypted.pdf b/nutch-plugins/parse-tika/sample/encrypted.pdf deleted file mode 100644 index 383cebb..0000000 Binary files a/nutch-plugins/parse-tika/sample/encrypted.pdf and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/nutch.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/nutch.html b/nutch-plugins/parse-tika/sample/nutch.html deleted file mode 100644 index 0aa7c98..0000000 --- a/nutch-plugins/parse-tika/sample/nutch.html +++ /dev/null @@ -1,519 +0,0 @@ -<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> -<html> -<head> -<META http-equiv="Content-Type" content="text/html; charset=UTF-8"> -<meta content="Apache Forrest" name="Generator"> -<meta name="Forrest-version" content="0.8"> -<meta name="Forrest-skin-name" content="lucene"> -<title>Welcome to Nutch!</title> -<link type="text/css" href="skin/basic.css" rel="stylesheet"> -<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet"> -<link media="print" type="text/css" href="skin/print.css" rel="stylesheet"> -<link type="text/css" href="skin/profile.css" rel="stylesheet"> -<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script> -<link rel="shortcut icon" href="images/favicon.ico"> -</head> -<body onload="init()"> -<script type="text/javascript">ndeSetTextSize();</script> -<div id="top"> -<!--+ - |breadtrail - +--> -<div class="breadtrail"> -<a href="http://www.apache.org/">Apache</a> > <a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script> -</div> -<!--+ - |header - +--> -<div class="header"> -<!--+ - |start group logo - +--> -<div class="grouplogo"> -<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a> -</div> -<!--+ - |end group logo - +--> -<!--+ - |start Project Logo - +--> -<div class="projectlogo"> -<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a> -</div> -<!--+ - |end Project Logo - +--> -<!--+ - |start Search - +--> -<div class="searchbox"> -<form action="http://search.lucidimagination.com/p:nutch" method="get" class="roundtopsmall"> -<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr"> - <input name="Search" value="Search" type="submit"> -</form> -<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a> -</div> -</div> -<!--+ - |end search - +--> -<!--+ - |start Tabs - +--> -<ul id="tabs"> -<li class="current"> -<a class="selected" href="index.html">Main</a> -</li> -<li> -<a class="unselected" href="http://wiki.apache.org/nutch/">Wiki</a> -</li> -<li> -<a class="unselected" href="http://issues.apache.org/jira/browse/Nutch">Jira</a> -</li> -</ul> -<!--+ - |end Tabs - +--> -</div> -</div> -<div id="main"> -<div id="publishedStrip"> -<!--+ - |start Subtabs - +--> -<div id="level2tabs"></div> -<!--+ - |end Endtabs - +--> -<script type="text/javascript"><!-- -document.write("Last Published: " + document.lastModified); -// --></script> -</div> -<!--+ - |breadtrail - +--> -<div class="breadtrail"> - - - </div> -<!--+ - |start Menu, mainarea - +--> -<!--+ - |start Menu - +--> -<div id="menu"> -<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Project</div> -<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;"> -<div class="menupage"> -<div class="menupagetitle">News</div> -</div> -<div class="menuitem"> -<a href="about.html">About</a> -</div> -<div class="menuitem"> -<a href="credits.html">Credits</a> -</div> -<div class="menuitem"> -<a href="http://www.cafepress.com/nutch/">Buy Stuff</a> -</div> -</div> -<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div> -<div id="menu_1.2" class="menuitemgroup"> -<div class="menuitem"> -<a href="http://wiki.apache.org/nutch/FAQ">FAQ</a> -</div> -<div class="menuitem"> -<a href="http://wiki.apache.org/nutch/">Wiki</a> -</div> -<div class="menuitem"> -<a href="tutorial.html">Tutorial (0.7.2)</a> -</div> -<div class="menuitem"> -<a href="tutorial8.html">Tutorial (0.8.x)</a> -</div> -<div class="menuitem"> -<a href="bot.html">Robot </a> -</div> -<div class="menuitem"> -<a href="i18n.html">i18n</a> -</div> -<div class="menuitem"> -<a href="apidocs-1.0/index.html">API Docs (1.0)</a> -</div> -<div class="menuitem"> -<a href="apidocs-0.9/index.html">API Docs (0.9)</a> -</div> -<div class="menuitem"> -<a href="apidocs-0.8.x/index.html">API Docs (0.8.x)</a> -</div> -<div class="menuitem"> -<a href="apidocs/index.html">API Docs (0.7.2)</a> -</div> -<div class="menuitem"> -<a href="http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html">API Docs (nightly)</a> -</div> -</div> -<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div> -<div id="menu_1.3" class="menuitemgroup"> -<div class="menuitem"> -<a href="release/">Download</a> -</div> -<div class="menuitem"> -<a href="nightly.html">Nightly builds</a> -</div> -<div class="menuitem"> -<a href="mailing_lists.html">Mailing Lists</a> -</div> -<div class="menuitem"> -<a href="issue_tracking.html">Issue Tracking</a> -</div> -<div class="menuitem"> -<a href="version_control.html">Version Control</a> -</div> -</div> -<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div> -<div id="menu_1.4" class="menuitemgroup"> -<div class="menuitem"> -<a href="http://lucene.apache.org/java/">Lucene Java</a> -</div> -<div class="menuitem"> -<a href="http://lucene.apache.org/hadoop/">Hadoop</a> -</div> -<div class="menuitem"> -<a href="http://incubator.apache.org/solr/">Solr</a> -</div> -</div> -<div id="credit"> -<hr> -<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a> -</div> -<div id="roundbottom"> -<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div> -<!--+ - |alternative credits - +--> -<div id="credit2"></div> -</div> -<!--+ - |end Menu - +--> -<!--+ - |start content - +--> -<div id="content"> -<div title="Portable Document Format" class="pdflink"> -<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br> - PDF</a> -</div> -<h1>Welcome to Nutch!</h1> -<div id="minitoc-area"> -<ul class="minitoc"> -<li> -<a href="#News">News</a> -<ul class="minitoc"> -<li> -<a href="#14+August+2009+-+Lucene+at+US+ApacheCon">14 August 2009 - Lucene at US ApacheCon</a> -</li> -<li> -<a href="#23+March+2009+-+Apache+Nutch+1.0+Released">23 March 2009 - Apache Nutch 1.0 Released</a> -</li> -<li> -<a href="#09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam">09 February 2009 - Lucene at ApacheCon Europe 2009 in - Amsterdam</a> -</li> -<li> -<a href="#2+April+2007%3A+Nutch+0.9+Released">2 April 2007: Nutch 0.9 Released</a> -</li> -<li> -<a href="#24+September+2006%3A+Nutch+0.8.1+Released">24 September 2006: Nutch 0.8.1 Released</a> -</li> -<li> -<a href="#25+July+2006%3A+Nutch+0.8+Released">25 July 2006: Nutch 0.8 Released</a> -</li> -<li> -<a href="#31+March+2006%3A+Nutch+0.7.2+Released">31 March 2006: Nutch 0.7.2 Released</a> -</li> -<li> -<a href="#1+October+2005%3A+Nutch+0.7.1+Released">1 October 2005: Nutch 0.7.1 Released</a> -</li> -<li> -<a href="#17+August+2005%3A+Nutch+0.7+Released">17 August 2005: Nutch 0.7 Released</a> -</li> -<li> -<a href="#June+2005%3A+Nutch+graduates+from+Incubator">June 2005: Nutch graduates from Incubator</a> -</li> -<li> -<a href="#January+2005%3A+Nutch+Joins+Apache+Incubator">January 2005: Nutch Joins Apache Incubator</a> -</li> -<li> -<a href="#September+2004%3A+Creative+Commons+launches+Nutch-based+Search">September 2004: Creative Commons launches Nutch-based Search</a> -</li> -<li> -<a href="#September+2004%3A+Oregon+State+University+switches+to+Nutch">September 2004: Oregon State University switches to Nutch</a> -</li> -</ul> -</li> -</ul> -</div> - - -<a name="N1000D"></a><a name="News"></a> -<h2 class="h3">News</h2> -<div class="section"> -<a name="N10013"></a><a name="14+August+2009+-+Lucene+at+US+ApacheCon"></a> -<h3 class="h4">14 August 2009 - Lucene at US ApacheCon</h3> -<p> - -<a href="http://www.us.apachecon.com/c/acus2009/" title="ApacheCon US 2009"> - <img alt="ApacheCon Logo" class="float-right" src="http://www.apache.org/events/current-event-125x125.png"> - </a> - ApacheCon US is once again in the Bay Area and Lucene is coming - along for the ride! The Lucene community has planned two full - days of talks, plus a meetup and the usual bevy of training. - With a well-balanced mix of first time and veteran ApacheCon - speakers, the - <a href="http://www.us.apachecon.com/c/acus2009/schedule#lucene">Lucene track</a> - at ApacheCon US promises to have something for everyone. Be sure - not to miss: - </p> -<p> Training:</p> -<ul> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/437">Lucene Boot Camp</a> - - A two day training session, Nov. 2nd & 3rd - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/375">Solr Day</a> - - A one day training session, Nov. 2nd - </li> - -</ul> -<p>Thursday, Nov. 5th</p> -<ul> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/428">Introduction to the Lucene Ecosystem - </a> - - Grant Ingersoll @ 9:00 - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/461">Lucene Basics and New Features</a> - - Michael Busch @ 10:00 - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/331">Apache Solr: Out of the Box</a> - - Chris Hostetter @ 14:00 - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/427">Introduction to Nutch</a> - - Andrzej Bialecki @ 15:00 - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/430">Lucene and Solr Performance Tuning</a> - - Mark Miller @ 16:30 - </li> - -</ul> -<p>Friday, Nov. 6th</p> -<ul> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/332">Implementing an Information Retrieval - Framework for an Organizational Repository</a> - - Sithu D Sudarsan @ 9:00 - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/333">Apache Mahout - Going from raw data to - Information</a> - - Isabel Drost @ 10:00 - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/334">MIME Magic with Apache Tika</a> - - Jukka Zitting @ 11:30 - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/335">Building Intelligent Search Applications - with the Lucene Ecosystem</a> - - Ted Dunning @ 14:00 - </li> - -<li> - -<a href="http://www.us.apachecon.com/c/acus2009/sessions/462">Realtime Search</a> - - Jason Rutherglen @ 15:00 - </li> - -</ul> -<a name="N10091"></a><a name="23+March+2009+-+Apache+Nutch+1.0+Released"></a> -<h3 class="h4">23 March 2009 - Apache Nutch 1.0 Released</h3> -<p>The 1.0 release of Nutch is now available. This release includes several major feature improvements - such as new indexing framework, new scoring framework, Apache Solr integration just to mention a few. - See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-1.0.txt"> - list of changes</a> made in this version. The release is available - <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> -<a name="N100A3"></a><a name="09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam"></a> -<h3 class="h4">09 February 2009 - Lucene at ApacheCon Europe 2009 in - Amsterdam</h3> -<p> - -<a href="http://www.eu.apachecon.com/c/aceu2009/" title="ApacheCon EU 2009"> - <img alt="ApacheCon EU 2009 Logo" class="float-right" src="http://www.eu.apachecon.com/page_attachments/0000/0115/125x125_basic.gif"> - </a> - - Lucene will be extremely well represented at - <a href="http://www.eu.apachecon.com/c/aceu2009/">ApacheCon EU 2009</a> - in Amsterdam, Netherlands this March 23-27, 2009: - </p> -<ul> - -<li> - -<a href="http://eu.apachecon.com/c/aceu2009/sessions/197">Lucene Boot Camp</a> - - A two day training session, March 23 & 24th</li> - -<li> -<a href="http://eu.apachecon.com/c/aceu2009/sessions/201">Solr Boot Camp</a> - A one day training session, March 24th</li> - -<li> -<a href="http://eu.apachecon.com/c/aceu2009/sessions/136">Introducing Apache Mahout</a> - Grant Ingersoll. March 25th @ 10:30</li> - -<li> -<a href="http://eu.apachecon.com/c/aceu2009/sessions/137">Lucene/Solr Case Studies</a> - Erik Hatcher. March 25th @ 11:30</li> - -<li> -<a href="http://eu.apachecon.com/c/aceu2009/sessions/138">Advanced Indexing Techniques with Apache Lucene</a> - Michael Busch. March 25th @ 14:00</li> - -<li> -<a href="http://eu.apachecon.com/c/aceu2009/sessions/251">Apache Solr - A Case Study</a> - Uri Boness. March 26th @ 17:30</li> - -<li> -<a href="http://eu.apachecon.com/c/aceu2009/sessions/250">Best of breed - httpd, forrest, solr and droids</a> - Thorsten Scherler. March 27th @ 17:30</li> - -<li> -<a href="http://eu.apachecon.com/c/aceu2009/sessions/165">Apache Droids - an intelligent standalone robot framework</a> - Thorsten Scherler. March 26th @ 15:00</li> - - -</ul> -<a name="N100EF"></a><a name="2+April+2007%3A+Nutch+0.9+Released"></a> -<h3 class="h4">2 April 2007: Nutch 0.9 Released</h3> -<p>The 0.9 release of Nutch is now available. This is the second release of Nutch - based entirely on the underlying Hadoop platform. This release includes several critical - bug fixes, as well as key speedups described in more detail at - <a href="http://blog.foofactory.fi/2007/03/twice-speed-half-size.html">Sami Siren's blog</a>. - See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.9.txt"> - list of changes</a> made in this version. The release is available - <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> -<a name="N10105"></a><a name="24+September+2006%3A+Nutch+0.8.1+Released"></a> -<h3 class="h4">24 September 2006: Nutch 0.8.1 Released</h3> -<p>The 0.8.1 release of Nutch is now available. This is a maintenance release to 0.8 branch fixing many serous bugs found in version 0.8. - See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.8.1.txt"> - list of changes</a> made in this version. The release is available - <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> -<a name="N10117"></a><a name="25+July+2006%3A+Nutch+0.8+Released"></a> -<h3 class="h4">25 July 2006: Nutch 0.8 Released</h3> -<p>The 0.8 release of Nutch is now available. This is the first release of Nutch based on - hadoop architecure. See <a href="http://svn.apache.org/viewvc/lucene/nutch/tags/release-0.8/CHANGES.txt?view=markup"> - CHANGES.txt</a> for list of changes made in this version. The release is available - <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> -<a name="N10129"></a><a name="31+March+2006%3A+Nutch+0.7.2+Released"></a> -<h3 class="h4">31 March 2006: Nutch 0.7.2 Released</h3> -<p>The 0.7.2 release of Nutch is now available. This is a bug fix release for 0.7 branch. See - <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=390158"> - CHANGES.txt</a> for details. The release is available - <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> -<a name="N1013B"></a><a name="1+October+2005%3A+Nutch+0.7.1+Released"></a> -<h3 class="h4">1 October 2005: Nutch 0.7.1 Released</h3> -<p>The 0.7.1 release of Nutch is now available. This is a bug fix release. See - <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986"> - CHANGES.txt</a> for details. The release is available - <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> -<a name="N1014D"></a><a name="17+August+2005%3A+Nutch+0.7+Released"></a> -<h3 class="h4">17 August 2005: Nutch 0.7 Released</h3> -<p>This is the first Nutch release as an Apache Lucene sub-project. See - <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/CHANGES.txt?rev=233150"> - CHANGES.txt</a> for details. The release is available - <a href="http://lucene.apache.org/nutch/release/">here</a>.</p> -<a name="N1015F"></a><a name="June+2005%3A+Nutch+graduates+from+Incubator"></a> -<h3 class="h4">June 2005: Nutch graduates from Incubator</h3> -<p>Nutch has now graduated from the Apache incubator, and is now - a Subproject of Lucene.</p> -<a name="N10169"></a><a name="January+2005%3A+Nutch+Joins+Apache+Incubator"></a> -<h3 class="h4">January 2005: Nutch Joins Apache Incubator</h3> -<p>Nutch is a two-year-old open source project, previously - hosted at Sourceforge and backed by its own non-profit - organization. The non-profit was founded in order to assign - copyright, so that we could retain the right to change the - license. We have now determined that the Apache license is the - appropriate license for Nutch and no longer require the - overhead of an independent non-profit organization. Nutch's - board of directors and its developers were both polled and - supported the move to the Apache foundation.</p> -<a name="N10173"></a><a name="September+2004%3A+Creative+Commons+launches+Nutch-based+Search"></a> -<h3 class="h4">September 2004: Creative Commons launches Nutch-based Search</h3> -<p>Creative Commons unveiled a beta version of its search - engine, which scours the web for text, images, audio, and video - free to re-use on certain terms a search refinement offered by - no other company or organization.</p> -<p>See the <a href="http://creativecommons.org/press-releases/entry/5064">Creative - Commons Press Release</a> for more details.</p> -<a name="N10184"></a><a name="September+2004%3A+Oregon+State+University+switches+to+Nutch"></a> -<h3 class="h4">September 2004: Oregon State University switches to Nutch</h3> -<p>Oregon State University is converting its searching - infrastructure from Googletm to the open source project - Nutch. The effort to replace the Googletm will realize - significant cost savings for Oregon State University, while - promoting both the Nutch Search Engine and transparency in - search engine use and management.</p> -<p>For more details see the announcement by OSU's <a href="http://osuosl.org/news_folder/nutch">Open Source - Lab</a>.</p> -</div> - - -</div> -<!--+ - |end content - +--> -<div class="clearboth"> </div> -</div> -<div id="footer"> -<!--+ - |start bottomstrip - +--> -<div class="lastmodified"> -<script type="text/javascript"><!-- -document.write("Last Published: " + document.lastModified); -// --></script> -</div> -<div class="copyright"> - Copyright © - 2006 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a> -</div> -<div id="logos"></div> -<!--+ - |end bottomstrip - +--> -</div> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/nutch_logo_tm.gif ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/nutch_logo_tm.gif b/nutch-plugins/parse-tika/sample/nutch_logo_tm.gif deleted file mode 100644 index 0545a60..0000000 Binary files a/nutch-plugins/parse-tika/sample/nutch_logo_tm.gif and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/ootest.odt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/ootest.odt b/nutch-plugins/parse-tika/sample/ootest.odt deleted file mode 100644 index e36e389..0000000 Binary files a/nutch-plugins/parse-tika/sample/ootest.odt and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/ootest.sxw ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/ootest.sxw b/nutch-plugins/parse-tika/sample/ootest.sxw deleted file mode 100644 index 260b1c2..0000000 Binary files a/nutch-plugins/parse-tika/sample/ootest.sxw and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/ootest.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/ootest.txt b/nutch-plugins/parse-tika/sample/ootest.txt deleted file mode 100644 index 685f89a..0000000 --- a/nutch-plugins/parse-tika/sample/ootest.txt +++ /dev/null @@ -1,30 +0,0 @@ -Abcedfg ????? -Abcdefg -Abcdefg -abcdefg - - - - - - - - - - - http://www.openoffice.org - -Title -Col1 -Col2 -Col3 -head -Cell1 -Cell2 -Cel3 -total -TOTAL -TOTAL -TOTAL - -Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Integer a leo in lacus malesuada ornare. Mauris sagittis. Nam vestibulum. Nunc gravida vestibulum augue. Praesent sed lectus quis lectus adipiscing bibendum. Sed nulla. Duis posuere justo eget urna. Proin lorem orci, vestibulum ut, consequat molestie, eleifend a, nibh. Mauris sed lacus. Etiam blandit tincidunt neque. Cras ac sapien. Duis erat. http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/pdftest.pdf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/pdftest.pdf b/nutch-plugins/parse-tika/sample/pdftest.pdf deleted file mode 100644 index e7c6e62..0000000 --- a/nutch-plugins/parse-tika/sample/pdftest.pdf +++ /dev/null @@ -1,157 +0,0 @@ -%PDF-1.2 -%���� - -9 0 obj -<< -/Length 10 0 R -/Filter /FlateDecode ->> -stream -H�Í�J�0�� ��{��f�$M��n�-���[&je���ۤ �~�$���}ï¿½É ï¿½Ij���s����~�X�-],��$Y���)�'N�u�1!���V�?��? -�b1Rbb�Ò�H�[��TD:#�&Øï¿½ï¿½X���i�$qnf�����]������a��{��أ���q|J�Ls]�Q�I��j�%��9��`�঺��U�ite�z�$����OeB�Äү�R��@zÜ���g���<��� -endstream -endobj -10 0 obj -246 -endobj -4 0 obj -<< -/Type /Page -/Parent 5 0 R -/Resources << -/Font << -/F0 6 0 R -/F1 7 0 R ->> -/ProcSet 2 0 R ->> -/Contents 9 0 R ->> -endobj -6 0 obj -<< -/Type /Font -/Subtype /TrueType -/Name /F0 -/BaseFont /Arial -/Encoding /WinAnsiEncoding ->> -endobj -7 0 obj -<< -/Type /Font -/Subtype /TrueType -/Name /F1 -/BaseFont /BookAntiqua,Bold -/FirstChar 31 -/LastChar 255 -/Widths [ 750 250 278 402 606 500 889 833 227 333 333 444 606 250 333 250 -296 500 500 500 500 500 500 500 500 500 500 250 250 606 606 606 -444 747 778 667 722 833 611 556 833 833 389 389 778 611 1000 833 -833 611 833 722 611 667 778 778 1000 667 667 667 333 606 333 606 -500 333 500 611 444 611 500 389 556 611 333 333 611 333 889 611 -556 611 611 389 444 333 611 556 833 500 556 500 310 606 310 606 -750 500 750 333 500 500 1000 500 500 333 1000 611 389 1000 750 750 -750 750 278 278 500 500 606 500 1000 333 998 444 389 833 750 750 -667 250 278 500 500 606 500 606 500 333 747 438 500 606 333 747 -500 400 549 361 361 333 576 641 250 333 361 488 500 889 890 889 -444 778 778 778 778 778 778 1000 722 611 611 611 611 389 389 389 -389 833 833 833 833 833 833 833 606 833 778 778 778 778 667 611 -611 500 500 500 500 500 500 778 444 500 500 500 500 333 333 333 -333 556 611 556 556 556 556 556 549 556 611 611 611 611 556 611 -556 ] -/Encoding /WinAnsiEncoding -/FontDescriptor 8 0 R ->> -endobj -8 0 obj -<< -/Type /FontDescriptor -/FontName /BookAntiqua,Bold -/Flags 16418 -/FontBBox [ -250 -260 1236 930 ] -/MissingWidth 750 -/StemV 146 -/StemH 146 -/ItalicAngle 0 -/CapHeight 930 -/XHeight 651 -/Ascent 930 -/Descent 260 -/Leading 210 -/MaxWidth 1030 -/AvgWidth 460 ->> -endobj -2 0 obj -[ /PDF /Text ] -endobj -5 0 obj -<< -/Kids [4 0 R ] -/Count 1 -/Type /Pages -/MediaBox [ 0 0 612 792 ] ->> -endobj -1 0 obj -<< -/Creator (1725.fm) -/CreationDate (1-Jan-3 18:15PM) -/Title (1725.PDF) -/Author (Unknown) -/Producer (Acrobat PDFWriter 3.02 for Windows) -/Keywords () -/Subject () ->> -endobj -3 0 obj -<< -/Pages 5 0 R -/Type /Catalog -/DefaultGray 11 0 R -/DefaultRGB 12 0 R ->> -endobj -11 0 obj -[/CalGray -<< -/WhitePoint [0.9505 1 1.0891 ] -/Gamma 0.2468 ->> -] -endobj -12 0 obj -[/CalRGB -<< -/WhitePoint [0.9505 1 1.0891 ] -/Gamma [0.2468 0.2468 0.2468 ] -/Matrix [0.4361 0.2225 0.0139 0.3851 0.7169 0.0971 0.1431 0.0606 0.7141 ] ->> -] -endobj -xref -0 13 -0000000000 65535 f -0000002172 00000 n -0000002046 00000 n -0000002363 00000 n -0000000375 00000 n -0000002080 00000 n -0000000518 00000 n -0000000633 00000 n -0000001760 00000 n -0000000021 00000 n -0000000352 00000 n -0000002460 00000 n -0000002548 00000 n -trailer -<< -/Size 13 -/Root 3 0 R -/Info 1 0 R -/ID [<47149510433dd4882f05f8c124223734><47149510433dd4882f05f8c124223734>] ->> -startxref -2726 -%%EOF http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/rsstest.rss ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/rsstest.rss b/nutch-plugins/parse-tika/sample/rsstest.rss deleted file mode 100644 index 6c4ae48..0000000 --- a/nutch-plugins/parse-tika/sample/rsstest.rss +++ /dev/null @@ -1,37 +0,0 @@ -<?xml version="1.0" encoding="ISO-8859-1" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<rss version="0.91"> - <channel> - <title>TestChannel</title> - <link>http://test.channel.com/</link> - <description>Sample RSS File for Junit test</description> - <language>en-us</language> - - <item> - <title>Home Page of Chris Mattmann</title> - <link>http://www-scf.usc.edu/~mattmann/</link> - <description>Chris Mattmann's home page</description> - </item> - - <item> - <title>Awesome Open Source Search Engine</title> - <link>http://www.nutch.org/</link> - <description>Yup, that's what it is</description> - </item> - </channel> -</rss> http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/test.rtf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/test.rtf b/nutch-plugins/parse-tika/sample/test.rtf deleted file mode 100644 index c67a6c8..0000000 --- a/nutch-plugins/parse-tika/sample/test.rtf +++ /dev/null @@ -1,17 +0,0 @@ -{\rtf1\ansi\deff1\adeflang1025 -{\fonttbl{\f0\froman\fprq2\fcharset0 Times;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\fmodern\fprq1\fcharset0 Courier New;}{\f3\froman\fprq2\fcharset0 Times New Roman;}{\f4\fnil\fprq2\fcharset0 Interface User;}{\f5\fnil\fprq2\fcharset0 Lucidasans;}{\f6\fnil\fprq0\fcharset0 Lucidasans;}} -{\colortbl;\red0\green0\blue0;\red0\green0\blue128;\red128\green128\blue128;} -{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033\snext1 Default;} -{\s2\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext2 Text body;} -{\s3\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af1\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon2\snext3 List;} -{\s4\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs20\lang255\ai\ltrch\dbch\afs20\langfe255\ai\loch\f1\fs20\lang1033\i\sbasedon1\snext4 Caption;} -{\s5\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext5 Index;} -{\*\cs7\cf0\rtlch\af2\afs24\lang255\ltrch\dbch\af2\afs24\langfe255\loch\f2\fs24\lang1033 Teletype;} -{\*\cs8\cf2\ul\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\fs24\lang1033 Internet Link;} -} -{\info{\title test rft document}{\subject tests}{\creatim\yr2004\mo9\dy20\hr19\min36}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6450}}\deftab709 -{\*\pgdsctbl -{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\pgdscnxt0 Default;}} -{\*\pgdscno0}\paperh16837\paperw11905\margl1800\margr1800\margt1440\margb1440\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc -\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\ql\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033{\loch\f2\fs24\lang1033\i0\b0\*\cs7\cf0\rtlch\ltrch\dbch\loch\f2\fs24\lang1033 The quick brown fox jumps over the lazy dog} -\par } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/sample/word97.doc ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/sample/word97.doc b/nutch-plugins/parse-tika/sample/word97.doc deleted file mode 100644 index 4d012da..0000000 Binary files a/nutch-plugins/parse-tika/sample/word97.doc and /dev/null differ http://git-wip-us.apache.org/repos/asf/nutch/blob/3f1cf76f/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf b/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf new file mode 100644 index 0000000..383cebb Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/encrypted.pdf differ
