Rearranged the source code as per Maven conventions for the build
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/ffa16784 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/ffa16784 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/ffa16784 Branch: refs/heads/NUTCH-2292 Commit: ffa167843999d6434d62ed7f636c9c9ae2eff080 Parents: 4eaeeb6 Author: Thamme Gowda <[email protected]> Authored: Tue Jul 5 15:02:59 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Tue Jul 5 15:02:59 2016 -0700 ---------------------------------------------------------------------- .gitignore | 6 + .../resources/fetch-test-site/dup_of_pagea.html | 11 + .../resources/fetch-test-site/exception.html | 13 + .../test/resources/fetch-test-site/index.html | 13 + .../fetch-test-site/nested_spider_trap.html | 23 + .../test/resources/fetch-test-site/pagea.html | 11 + .../test/resources/fetch-test-site/pageb.html | 11 + .../test/resources/fetch-test-site/robots.txt | 0 .../src/test/resources/test-mime-util/test.xlsx | Bin 0 -> 3950 bytes .../20150309101625/content/part-00000/.data.crc | Bin 0 -> 124 bytes .../content/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101625/content/part-00000/data | Bin 0 -> 14452 bytes .../20150309101625/content/part-00000/index | Bin 0 -> 217 bytes .../crawl_fetch/part-00000/.data.crc | Bin 0 -> 12 bytes .../crawl_fetch/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101625/crawl_fetch/part-00000/data | Bin 0 -> 293 bytes .../20150309101625/crawl_fetch/part-00000/index | Bin 0 -> 217 bytes .../crawl_generate/.part-00000.crc | Bin 0 -> 12 bytes .../20150309101625/crawl_generate/part-00000 | Bin 0 -> 169 bytes .../20150309101625/crawl_parse/.part-00000.crc | Bin 0 -> 68 bytes .../20150309101625/crawl_parse/part-00000 | Bin 0 -> 7627 bytes .../parse_data/part-00000/.data.crc | Bin 0 -> 24 bytes .../parse_data/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101625/parse_data/part-00000/data | Bin 0 -> 1985 bytes 
.../20150309101625/parse_data/part-00000/index | Bin 0 -> 217 bytes .../parse_text/part-00000/.data.crc | Bin 0 -> 60 bytes .../parse_text/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101625/parse_text/part-00000/data | Bin 0 -> 6554 bytes .../20150309101625/parse_text/part-00000/index | Bin 0 -> 217 bytes .../20150309101656/content/part-00000/.data.crc | Bin 0 -> 3372 bytes .../content/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101656/content/part-00000/data | Bin 0 -> 430250 bytes .../20150309101656/content/part-00000/index | Bin 0 -> 220 bytes .../crawl_fetch/part-00000/.data.crc | Bin 0 -> 104 bytes .../crawl_fetch/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101656/crawl_fetch/part-00000/data | Bin 0 -> 12121 bytes .../20150309101656/crawl_fetch/part-00000/index | Bin 0 -> 220 bytes .../crawl_generate/.part-00000.crc | Bin 0 -> 52 bytes .../20150309101656/crawl_generate/part-00000 | Bin 0 -> 5590 bytes .../20150309101656/crawl_parse/.part-00000.crc | Bin 0 -> 1652 bytes .../20150309101656/crawl_parse/part-00000 | Bin 0 -> 210047 bytes .../parse_data/part-00000/.data.crc | Bin 0 -> 460 bytes .../parse_data/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101656/parse_data/part-00000/data | Bin 0 -> 57355 bytes .../20150309101656/parse_data/part-00000/index | Bin 0 -> 220 bytes .../parse_text/part-00000/.data.crc | Bin 0 -> 1260 bytes .../parse_text/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101656/parse_text/part-00000/data | Bin 0 -> 159920 bytes .../20150309101656/parse_text/part-00000/index | Bin 0 -> 220 bytes nutch-plugins/build-plugin.xml | 255 ++++++ nutch-plugins/build.xml | 213 +++++ nutch-plugins/creativecommons/README.txt | 1 + nutch-plugins/creativecommons/build.xml | 28 + .../creativecommons/conf/crawl-urlfilter.txt | 18 + .../creativecommons/conf/nutch-site.xml | 50 ++ nutch-plugins/creativecommons/data/anchor.html | 9 + nutch-plugins/creativecommons/data/rdf.html | 35 + 
nutch-plugins/creativecommons/data/rel.html | 6 + nutch-plugins/creativecommons/ivy.xml | 41 + nutch-plugins/creativecommons/plugin.xml | 48 ++ nutch-plugins/creativecommons/pom.xml | 38 + .../creativecommons/nutch/CCIndexingFilter.java | 124 +++ .../creativecommons/nutch/CCParseFilter.java | 300 +++++++ .../java/org/creativecommons/nutch/package.html | 5 + .../nutch/TestCCParseFilter.java | 73 ++ nutch-plugins/feed/build.xml | 45 ++ nutch-plugins/feed/ivy.xml | 43 + nutch-plugins/feed/plugin.xml | 49 ++ nutch-plugins/feed/pom.xml | 45 ++ nutch-plugins/feed/sample/rsstest.rss | 36 + .../nutch/indexer/feed/FeedIndexingFilter.java | 129 +++ .../apache/nutch/indexer/feed/package-info.java | 22 + .../org/apache/nutch/parse/feed/FeedParser.java | 374 +++++++++ .../apache/nutch/parse/feed/package-info.java | 22 + .../apache/nutch/parse/feed/TestFeedParser.java | 124 +++ nutch-plugins/headings/build.xml | 22 + nutch-plugins/headings/ivy.xml | 41 + nutch-plugins/headings/plugin.xml | 45 ++ nutch-plugins/headings/pom.xml | 38 + .../parse/headings/HeadingsParseFilter.java | 124 +++ .../nutch/parse/headings/package-info.java | 22 + nutch-plugins/index-anchor/build.xml | 22 + nutch-plugins/index-anchor/ivy.xml | 41 + nutch-plugins/index-anchor/plugin.xml | 38 + nutch-plugins/index-anchor/pom.xml | 38 + .../indexer/anchor/AnchorIndexingFilter.java | 107 +++ .../apache/nutch/indexer/anchor/package.html | 5 + .../anchor/TestAnchorIndexingFilter.java | 67 ++ nutch-plugins/index-basic/build.xml | 22 + nutch-plugins/index-basic/ivy.xml | 41 + nutch-plugins/index-basic/plugin.xml | 42 + nutch-plugins/index-basic/pom.xml | 38 + .../indexer/basic/BasicIndexingFilter.java | 158 ++++ .../org/apache/nutch/indexer/basic/package.html | 5 + .../indexer/basic/TestBasicIndexingFilter.java | 99 +++ nutch-plugins/index-geoip/build-ivy.xml | 54 ++ nutch-plugins/index-geoip/build.xml | 27 + nutch-plugins/index-geoip/ivy.xml | 46 ++ nutch-plugins/index-geoip/plugin.xml | 51 ++ 
nutch-plugins/index-geoip/pom.xml | 55 ++ .../indexer/geoip/GeoIPDocumentCreator.java | 210 +++++ .../indexer/geoip/GeoIPIndexingFilter.java | 241 ++++++ .../nutch/indexer/geoip/package-info.java | 28 + nutch-plugins/index-links/build.xml | 22 + nutch-plugins/index-links/ivy.xml | 41 + nutch-plugins/index-links/plugin.xml | 41 + nutch-plugins/index-links/pom.xml | 38 + .../indexer/links/LinksIndexingFilter.java | 167 ++++ .../indexer/links/TestLinksIndexingFilter.java | 218 +++++ .../org/apache/nutch/parse/TestOutlinks.java | 54 ++ nutch-plugins/index-metadata/build.xml | 22 + nutch-plugins/index-metadata/ivy.xml | 41 + nutch-plugins/index-metadata/plugin.xml | 42 + nutch-plugins/index-metadata/pom.xml | 38 + .../nutch/indexer/metadata/MetadataIndexer.java | 104 +++ .../nutch/indexer/metadata/package-info.java | 23 + nutch-plugins/index-more/build.xml | 22 + nutch-plugins/index-more/ivy.xml | 41 + nutch-plugins/index-more/plugin.xml | 42 + nutch-plugins/index-more/pom.xml | 38 + .../nutch/indexer/more/MoreIndexingFilter.java | 344 ++++++++ .../org/apache/nutch/indexer/more/package.html | 6 + .../indexer/more/TestMoreIndexingFilter.java | 123 +++ nutch-plugins/index-replace/README.txt | 95 +++ nutch-plugins/index-replace/build.xml | 55 ++ nutch-plugins/index-replace/ivy.xml | 41 + nutch-plugins/index-replace/plugin.xml | 22 + nutch-plugins/index-replace/pom.xml | 38 + .../index-replace/sample/testIndexReplace.html | 12 + .../nutch/indexer/replace/FieldReplacer.java | 196 +++++ .../nutch/indexer/replace/ReplaceIndexer.java | 330 ++++++++ .../nutch/indexer/replace/package-info.java | 22 + .../nutch/indexer/replace/TestIndexReplace.java | 456 +++++++++++ nutch-plugins/index-static/build.xml | 22 + nutch-plugins/index-static/ivy.xml | 41 + nutch-plugins/index-static/plugin.xml | 42 + nutch-plugins/index-static/pom.xml | 38 + .../indexer/staticfield/StaticFieldIndexer.java | 143 ++++ .../nutch/indexer/staticfield/package.html | 5 + 
.../staticfield/TestStaticFieldIndexerTest.java | 194 +++++ nutch-plugins/indexer-cloudsearch/README.md | 58 ++ nutch-plugins/indexer-cloudsearch/build.xml | 22 + .../indexer-cloudsearch/createCSDomain.sh | 22 + nutch-plugins/indexer-cloudsearch/ivy.xml | 41 + nutch-plugins/indexer-cloudsearch/plugin.xml | 50 ++ nutch-plugins/indexer-cloudsearch/pom.xml | 45 ++ .../cloudsearch/CloudSearchConstants.java | 27 + .../cloudsearch/CloudSearchIndexWriter.java | 382 +++++++++ .../cloudsearch/CloudSearchUtils.java | 73 ++ nutch-plugins/indexer-dummy/build.xml | 22 + nutch-plugins/indexer-dummy/ivy.xml | 41 + nutch-plugins/indexer-dummy/plugin.xml | 38 + nutch-plugins/indexer-dummy/pom.xml | 38 + .../indexwriter/dummy/DummyIndexWriter.java | 103 +++ .../nutch/indexwriter/dummy/package-info.java | 23 + nutch-plugins/indexer-elastic/build-ivy.xml | 54 ++ nutch-plugins/indexer-elastic/build.xml | 22 + .../indexer-elastic/howto_upgrade_es.txt | 6 + nutch-plugins/indexer-elastic/ivy.xml | 43 + nutch-plugins/indexer-elastic/plugin.xml | 71 ++ nutch-plugins/indexer-elastic/pom.xml | 45 ++ .../indexwriter/elastic/ElasticConstants.java | 28 + .../indexwriter/elastic/ElasticIndexWriter.java | 279 +++++++ .../nutch/indexwriter/elastic/package-info.java | 22 + nutch-plugins/indexer-solr/build-ivy.xml | 54 ++ nutch-plugins/indexer-solr/build.xml | 22 + nutch-plugins/indexer-solr/ivy.xml | 44 + nutch-plugins/indexer-solr/plugin.xml | 48 ++ nutch-plugins/indexer-solr/pom.xml | 55 ++ .../nutch/indexwriter/solr/SolrConstants.java | 56 ++ .../nutch/indexwriter/solr/SolrIndexWriter.java | 277 +++++++ .../indexwriter/solr/SolrMappingReader.java | 147 ++++ .../nutch/indexwriter/solr/SolrUtils.java | 97 +++ .../nutch/indexwriter/solr/package-info.java | 22 + nutch-plugins/language-identifier/build.xml | 38 + nutch-plugins/language-identifier/ivy.xml | 41 + nutch-plugins/language-identifier/plugin.xml | 49 ++ nutch-plugins/language-identifier/pom.xml | 38 + 
.../nutch/analysis/lang/HTMLLanguageParser.java | 320 ++++++++ .../analysis/lang/LanguageIndexingFilter.java | 89 +++ .../nutch/analysis/lang/langmappings.properties | 188 +++++ .../org/apache/nutch/analysis/lang/package.html | 6 + .../analysis/lang/TestHTMLLanguageParser.java | 149 ++++ .../test/org/apache/nutch/analysis/lang/da.test | 108 +++ .../test/org/apache/nutch/analysis/lang/de.test | 104 +++ .../test/org/apache/nutch/analysis/lang/el.test | 109 +++ .../test/org/apache/nutch/analysis/lang/en.test | 105 +++ .../test/org/apache/nutch/analysis/lang/es.test | 107 +++ .../test/org/apache/nutch/analysis/lang/fi.test | 106 +++ .../test/org/apache/nutch/analysis/lang/fr.test | 105 +++ .../test/org/apache/nutch/analysis/lang/it.test | 109 +++ .../test/org/apache/nutch/analysis/lang/nl.test | 105 +++ .../test/org/apache/nutch/analysis/lang/pt.test | 105 +++ .../test/org/apache/nutch/analysis/lang/sv.test | 108 +++ .../nutch/analysis/lang/test-referencial.txt | 10 + nutch-plugins/lib-htmlunit/build-ivy.xml | 54 ++ nutch-plugins/lib-htmlunit/build.xml | 28 + nutch-plugins/lib-htmlunit/ivy.xml | 52 ++ nutch-plugins/lib-htmlunit/plugin.xml | 166 ++++ nutch-plugins/lib-htmlunit/pom.xml | 55 ++ .../protocol/htmlunit/HtmlUnitWebDriver.java | 189 +++++ .../htmlunit/HtmlUnitWebWindowListener.java | 53 ++ nutch-plugins/lib-http/build.xml | 22 + nutch-plugins/lib-http/ivy.xml | 41 + nutch-plugins/lib-http/plugin.xml | 33 + nutch-plugins/lib-http/pom.xml | 38 + .../protocol/http/api/BlockedException.java | 26 + .../nutch/protocol/http/api/HttpBase.java | 587 ++++++++++++++ .../nutch/protocol/http/api/HttpException.java | 40 + .../protocol/http/api/HttpRobotRulesParser.java | 167 ++++ .../apache/nutch/protocol/http/api/package.html | 6 + .../protocol/http/api/TestRobotRulesParser.java | 123 +++ nutch-plugins/lib-nekohtml/build.xml | 30 + nutch-plugins/lib-nekohtml/ivy.xml | 42 + nutch-plugins/lib-nekohtml/plugin.xml | 38 + nutch-plugins/lib-nekohtml/pom.xml | 38 + 
nutch-plugins/lib-regex-filter/build.xml | 22 + nutch-plugins/lib-regex-filter/ivy.xml | 41 + nutch-plugins/lib-regex-filter/plugin.xml | 33 + nutch-plugins/lib-regex-filter/pom.xml | 38 + .../apache/nutch/urlfilter/api/RegexRule.java | 102 +++ .../nutch/urlfilter/api/RegexURLFilterBase.java | 315 ++++++++ .../nutch/urlfilter/api/package-info.java | 23 + .../urlfilter/api/RegexURLFilterBaseTest.java | 134 ++++ nutch-plugins/lib-selenium/build-ivy.xml | 54 ++ nutch-plugins/lib-selenium/build.xml | 28 + .../lib-selenium/howto_upgrade_selenium.txt | 15 + nutch-plugins/lib-selenium/ivy.xml | 52 ++ nutch-plugins/lib-selenium/plugin.xml | 175 ++++ nutch-plugins/lib-selenium/pom.xml | 49 ++ .../nutch/protocol/selenium/HttpWebClient.java | 236 ++++++ nutch-plugins/lib-xml/build.xml | 36 + nutch-plugins/lib-xml/ivy.xml | 44 + nutch-plugins/lib-xml/plugin.xml | 65 ++ nutch-plugins/lib-xml/pom.xml | 38 + nutch-plugins/microformats-reltag/build.xml | 27 + nutch-plugins/microformats-reltag/ivy.xml | 41 + nutch-plugins/microformats-reltag/plugin.xml | 49 ++ nutch-plugins/microformats-reltag/pom.xml | 38 + .../reltag/RelTagIndexingFilter.java | 77 ++ .../nutch/microformats/reltag/RelTagParser.java | 148 ++++ .../nutch/microformats/reltag/package.html | 8 + nutch-plugins/mimetype-filter/build.xml | 28 + nutch-plugins/mimetype-filter/ivy.xml | 41 + nutch-plugins/mimetype-filter/plugin.xml | 37 + nutch-plugins/mimetype-filter/pom.xml | 38 + .../mimetype-filter/sample/allow-images.txt | 34 + .../mimetype-filter/sample/block-html.txt | 34 + .../indexer/filter/MimeTypeIndexingFilter.java | 273 +++++++ .../filter/MimeTypeIndexingFilterTest.java | 114 +++ nutch-plugins/nutch-extensionpoints/build.xml | 30 + nutch-plugins/nutch-extensionpoints/ivy.xml | 41 + nutch-plugins/nutch-extensionpoints/plugin.xml | 67 ++ nutch-plugins/nutch-extensionpoints/pom.xml | 38 + nutch-plugins/parse-ext/build.xml | 32 + nutch-plugins/parse-ext/command | 24 + nutch-plugins/parse-ext/ivy.xml | 41 + 
nutch-plugins/parse-ext/plugin.xml | 60 ++ nutch-plugins/parse-ext/pom.xml | 38 + .../org/apache/nutch/parse/ext/ExtParser.java | 183 +++++ .../apache/nutch/parse/ext/package-info.java | 22 + .../apache/nutch/parse/ext/TestExtParser.java | 130 +++ nutch-plugins/parse-html/build.xml | 40 + nutch-plugins/parse-html/ivy.xml | 42 + nutch-plugins/parse-html/plugin.xml | 48 ++ nutch-plugins/parse-html/pom.xml | 49 ++ .../org/apache/nutch/parse/html/DOMBuilder.java | 766 ++++++++++++++++++ .../nutch/parse/html/DOMContentUtils.java | 400 ++++++++++ .../nutch/parse/html/HTMLMetaProcessor.java | 214 +++++ .../org/apache/nutch/parse/html/HtmlParser.java | 352 ++++++++ .../parse/html/XMLCharacterRecognizer.java | 112 +++ .../org/apache/nutch/parse/html/package.html | 5 + .../nutch/parse/html/TestDOMContentUtils.java | 347 ++++++++ .../apache/nutch/parse/html/TestHtmlParser.java | 122 +++ .../parse/html/TestRobotsMetaProcessor.java | 155 ++++ nutch-plugins/parse-js/build.xml | 22 + nutch-plugins/parse-js/ivy.xml | 41 + nutch-plugins/parse-js/plugin.xml | 53 ++ nutch-plugins/parse-js/pom.xml | 38 + .../apache/nutch/parse/js/JSParseFilter.java | 301 +++++++ .../org/apache/nutch/parse/js/package-info.java | 23 + nutch-plugins/parse-metatags/README.txt | 17 + nutch-plugins/parse-metatags/build.xml | 37 + nutch-plugins/parse-metatags/ivy.xml | 41 + nutch-plugins/parse-metatags/plugin.xml | 22 + nutch-plugins/parse-metatags/pom.xml | 38 + .../parse-metatags/sample/testMetatags.html | 9 + .../sample/testMultivalueMetatags.html | 12 + .../nutch/parse/metatags/MetaTagsParser.java | 124 +++ .../nutch/parse/metatags/package-info.java | 24 + .../nutch/parse/metatags/TestMetatagParser.java | 104 +++ nutch-plugins/parse-replace/README.txt | 91 +++ nutch-plugins/parse-replace/build.xml | 37 + nutch-plugins/parse-replace/ivy.xml | 41 + nutch-plugins/parse-replace/plugin.xml | 22 + nutch-plugins/parse-replace/pom.xml | 38 + .../parse-replace/sample/testParseReplace.html | 11 + 
.../nutch/parse/replace/ReplaceParser.java | 74 ++ .../nutch/parse/replace/package-info.java | 22 + .../nutch/parse/replace/TestParseReplace.java | 68 ++ nutch-plugins/parse-swf/build.xml | 38 + nutch-plugins/parse-swf/ivy.xml | 41 + nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt | 33 + nutch-plugins/parse-swf/lib/javaswf.jar | Bin 0 -> 125369 bytes nutch-plugins/parse-swf/plugin.xml | 44 + nutch-plugins/parse-swf/pom.xml | 46 ++ nutch-plugins/parse-swf/sample/test1.swf | Bin 0 -> 21054 bytes nutch-plugins/parse-swf/sample/test1.txt | 60 ++ nutch-plugins/parse-swf/sample/test2.swf | Bin 0 -> 42534 bytes nutch-plugins/parse-swf/sample/test2.txt | 5 + nutch-plugins/parse-swf/sample/test3.swf | Bin 0 -> 51562 bytes nutch-plugins/parse-swf/sample/test3.txt | 11 + .../org/apache/nutch/parse/swf/SWFParser.java | 685 ++++++++++++++++ .../apache/nutch/parse/swf/package-info.java | 22 + .../apache/nutch/parse/swf/TestSWFParser.java | 94 +++ nutch-plugins/parse-tika/build-ivy.xml | 54 ++ nutch-plugins/parse-tika/build.xml | 55 ++ nutch-plugins/parse-tika/howto_upgrade_tika.txt | 8 + nutch-plugins/parse-tika/ivy.xml | 46 ++ nutch-plugins/parse-tika/plugin.xml | 136 ++++ nutch-plugins/parse-tika/pom.xml | 45 ++ nutch-plugins/parse-tika/sample/encrypted.pdf | Bin 0 -> 3431 bytes nutch-plugins/parse-tika/sample/nutch.html | 519 ++++++++++++ .../parse-tika/sample/nutch_logo_tm.gif | Bin 0 -> 2747 bytes nutch-plugins/parse-tika/sample/ootest.odt | Bin 0 -> 20753 bytes nutch-plugins/parse-tika/sample/ootest.sxw | Bin 0 -> 20125 bytes nutch-plugins/parse-tika/sample/ootest.txt | 30 + nutch-plugins/parse-tika/sample/pdftest.pdf | 157 ++++ nutch-plugins/parse-tika/sample/rsstest.rss | 37 + nutch-plugins/parse-tika/sample/test.rtf | 17 + nutch-plugins/parse-tika/sample/word97.doc | Bin 0 -> 8192 bytes .../tika/BoilerpipeExtractorRepository.java | 62 ++ .../org/apache/nutch/parse/tika/DOMBuilder.java | 794 +++++++++++++++++++ .../nutch/parse/tika/DOMContentUtils.java | 402 ++++++++++ 
.../nutch/parse/tika/HTMLMetaProcessor.java | 214 +++++ .../org/apache/nutch/parse/tika/TikaParser.java | 286 +++++++ .../parse/tika/XMLCharacterRecognizer.java | 112 +++ .../apache/nutch/parse/tika/package-info.java | 23 + .../apache/nutch/tika/TestDOMContentUtils.java | 337 ++++++++ .../org/apache/nutch/tika/TestFeedParser.java | 121 +++ .../apache/nutch/tika/TestImageMetadata.java | 67 ++ .../org/apache/nutch/tika/TestMSWordParser.java | 92 +++ .../org/apache/nutch/tika/TestOOParser.java | 107 +++ .../org/apache/nutch/tika/TestPdfParser.java | 73 ++ .../org/apache/nutch/tika/TestRTFParser.java | 81 ++ .../nutch/tika/TestRobotsMetaProcessor.java | 156 ++++ nutch-plugins/parse-zip/build.xml | 38 + nutch-plugins/parse-zip/ivy.xml | 41 + nutch-plugins/parse-zip/plugin.xml | 46 ++ nutch-plugins/parse-zip/pom.xml | 38 + nutch-plugins/parse-zip/sample/test.zip | Bin 0 -> 182 bytes .../org/apache/nutch/parse/zip/ZipParser.java | 144 ++++ .../nutch/parse/zip/ZipTextExtractor.java | 120 +++ .../apache/nutch/parse/zip/package-info.java | 22 + .../apache/nutch/parse/zip/TestZipParser.java | 71 ++ .../parsefilter-naivebayes/build-ivy.xml | 54 ++ nutch-plugins/parsefilter-naivebayes/build.xml | 22 + nutch-plugins/parsefilter-naivebayes/ivy.xml | 49 ++ nutch-plugins/parsefilter-naivebayes/plugin.xml | 56 ++ nutch-plugins/parsefilter-naivebayes/pom.xml | 38 + .../nutch/parsefilter/naivebayes/Classify.java | 120 +++ .../naivebayes/NaiveBayesParseFilter.java | 197 +++++ .../nutch/parsefilter/naivebayes/Train.java | 148 ++++ .../parsefilter/naivebayes/package-info.java | 28 + nutch-plugins/parsefilter-regex/build.xml | 27 + .../data/regex-parsefilter.txt | 10 + nutch-plugins/parsefilter-regex/ivy.xml | 37 + nutch-plugins/parsefilter-regex/plugin.xml | 42 + nutch-plugins/parsefilter-regex/pom.xml | 38 + .../parsefilter/regex/RegexParseFilter.java | 199 +++++ .../nutch/parsefilter/regex/package-info.java | 23 + .../parsefilter/regex/TestRegexParseFilter.java | 77 ++ 
nutch-plugins/plugin.dtd | 206 +++++ nutch-plugins/plugin/pom.xml | 38 + nutch-plugins/pom.xml | 94 ++- nutch-plugins/protocol-file/build.xml | 29 + nutch-plugins/protocol-file/ivy.xml | 41 + nutch-plugins/protocol-file/plugin.xml | 46 ++ nutch-plugins/protocol-file/pom.xml | 38 + .../protocol-file/sample/testprotocolfile.txt | 1 + .../sample/testprotocolfile_(encoded).txt | 1 + .../org/apache/nutch/protocol/file/File.java | 228 ++++++ .../apache/nutch/protocol/file/FileError.java | 36 + .../nutch/protocol/file/FileException.java | 40 + .../nutch/protocol/file/FileResponse.java | 317 ++++++++ .../org/apache/nutch/protocol/file/package.html | 5 + .../nutch/protocol/file/TestProtocolFile.java | 99 +++ nutch-plugins/protocol-ftp/build.xml | 22 + nutch-plugins/protocol-ftp/ivy.xml | 42 + nutch-plugins/protocol-ftp/plugin.xml | 46 ++ nutch-plugins/protocol-ftp/pom.xml | 38 + .../org/apache/nutch/protocol/ftp/Client.java | 595 ++++++++++++++ .../java/org/apache/nutch/protocol/ftp/Ftp.java | 267 +++++++ .../org/apache/nutch/protocol/ftp/FtpError.java | 36 + .../apache/nutch/protocol/ftp/FtpException.java | 46 ++ .../ftp/FtpExceptionBadSystResponse.java | 29 + .../FtpExceptionCanNotHaveDataConnection.java | 29 + ...ExceptionControlClosedByForcedDataClose.java | 30 + .../ftp/FtpExceptionUnknownForcedDataClose.java | 30 + .../apache/nutch/protocol/ftp/FtpResponse.java | 521 ++++++++++++ .../nutch/protocol/ftp/FtpRobotRulesParser.java | 121 +++ .../protocol/ftp/PrintCommandListener.java | 71 ++ .../org/apache/nutch/protocol/ftp/package.html | 5 + nutch-plugins/protocol-htmlunit/build.xml | 37 + nutch-plugins/protocol-htmlunit/ivy.xml | 38 + nutch-plugins/protocol-htmlunit/plugin.xml | 51 ++ nutch-plugins/protocol-htmlunit/pom.xml | 51 ++ .../apache/nutch/protocol/htmlunit/Http.java | 63 ++ .../nutch/protocol/htmlunit/HttpResponse.java | 573 +++++++++++++ .../apache/nutch/protocol/htmlunit/package.html | 21 + nutch-plugins/protocol-http/build.xml | 50 ++ 
nutch-plugins/protocol-http/ivy.xml | 41 + nutch-plugins/protocol-http/jsp/basic-http.jsp | 44 + nutch-plugins/protocol-http/jsp/brokenpage.jsp | 47 ++ nutch-plugins/protocol-http/jsp/redirect301.jsp | 49 ++ nutch-plugins/protocol-http/jsp/redirect302.jsp | 49 ++ nutch-plugins/protocol-http/plugin.xml | 51 ++ nutch-plugins/protocol-http/pom.xml | 45 ++ .../org/apache/nutch/protocol/http/Http.java | 73 ++ .../nutch/protocol/http/HttpResponse.java | 558 +++++++++++++ .../org/apache/nutch/protocol/http/package.html | 5 + .../src/test/conf/nutch-site-test.xml | 52 ++ .../nutch/protocol/http/TestProtocolHttp.java | 140 ++++ nutch-plugins/protocol-httpclient/build.xml | 45 ++ nutch-plugins/protocol-httpclient/ivy.xml | 42 + nutch-plugins/protocol-httpclient/jsp/basic.jsp | 74 ++ .../protocol-httpclient/jsp/cookies.jsp | 63 ++ .../protocol-httpclient/jsp/digest.jsp | 68 ++ .../protocol-httpclient/jsp/noauth.jsp | 36 + nutch-plugins/protocol-httpclient/jsp/ntlm.jsp | 89 +++ nutch-plugins/protocol-httpclient/plugin.xml | 58 ++ nutch-plugins/protocol-httpclient/pom.xml | 50 ++ .../DummySSLProtocolSocketFactory.java | 163 ++++ .../httpclient/DummyX509TrustManager.java | 92 +++ .../apache/nutch/protocol/httpclient/Http.java | 572 +++++++++++++ .../protocol/httpclient/HttpAuthentication.java | 45 ++ .../httpclient/HttpAuthenticationException.java | 71 ++ .../httpclient/HttpAuthenticationFactory.java | 98 +++ .../httpclient/HttpBasicAuthentication.java | 199 +++++ .../httpclient/HttpFormAuthConfigurer.java | 106 +++ .../httpclient/HttpFormAuthentication.java | 223 ++++++ .../nutch/protocol/httpclient/HttpResponse.java | 216 +++++ .../nutch/protocol/httpclient/package.html | 9 + .../src/test/conf/httpclient-auth-test.xml | 58 ++ .../src/test/conf/nutch-site-test.xml | 52 ++ .../httpclient/TestProtocolHttpClient.java | 217 +++++ .../protocol-interactiveselenium/README.md | 38 + .../protocol-interactiveselenium/build-ivy.xml | 54 ++ .../protocol-interactiveselenium/build.xml | 37 + 
.../protocol-interactiveselenium/ivy.xml | 42 + .../protocol-interactiveselenium/plugin.xml | 47 ++ .../protocol-interactiveselenium/pom.xml | 50 ++ .../protocol/interactiveselenium/Http.java | 59 ++ .../interactiveselenium/HttpResponse.java | 399 ++++++++++ .../DefalultMultiInteractionHandler.java | 53 ++ .../DefaultClickAllAjaxLinksHandler.java | 88 ++ .../handlers/DefaultHandler.java | 30 + .../handlers/InteractiveSeleniumHandler.java | 25 + .../protocol/interactiveselenium/package.html | 5 + nutch-plugins/protocol-selenium/README.md | 208 +++++ nutch-plugins/protocol-selenium/build-ivy.xml | 54 ++ nutch-plugins/protocol-selenium/build.xml | 36 + nutch-plugins/protocol-selenium/ivy.xml | 42 + nutch-plugins/protocol-selenium/plugin.xml | 47 ++ nutch-plugins/protocol-selenium/pom.xml | 50 ++ .../apache/nutch/protocol/selenium/Http.java | 59 ++ .../nutch/protocol/selenium/HttpResponse.java | 360 +++++++++ .../apache/nutch/protocol/selenium/package.html | 5 + nutch-plugins/scoring-depth/build.xml | 6 + nutch-plugins/scoring-depth/ivy.xml | 41 + nutch-plugins/scoring-depth/plugin.xml | 24 + nutch-plugins/scoring-depth/pom.xml | 38 + .../nutch/scoring/depth/DepthScoringFilter.java | 207 +++++ .../nutch/scoring/depth/package-info.java | 23 + nutch-plugins/scoring-link/build.xml | 27 + nutch-plugins/scoring-link/ivy.xml | 41 + nutch-plugins/scoring-link/plugin.xml | 39 + nutch-plugins/scoring-link/pom.xml | 38 + .../scoring/link/LinkAnalysisScoringFilter.java | 95 +++ .../apache/nutch/scoring/link/package-info.java | 23 + nutch-plugins/scoring-opic/build.xml | 27 + nutch-plugins/scoring-opic/ivy.xml | 41 + nutch-plugins/scoring-opic/plugin.xml | 39 + nutch-plugins/scoring-opic/pom.xml | 38 + .../nutch/scoring/opic/OPICScoringFilter.java | 173 ++++ .../apache/nutch/scoring/opic/package-info.java | 23 + nutch-plugins/scoring-similarity/build-ivy.xml | 54 ++ nutch-plugins/scoring-similarity/build.xml | 27 + nutch-plugins/scoring-similarity/ivy.xml | 42 + 
nutch-plugins/scoring-similarity/plugin.xml | 45 ++ nutch-plugins/scoring-similarity/pom.xml | 45 ++ .../scoring/similarity/SimilarityModel.java | 38 + .../similarity/SimilarityScoringFilter.java | 70 ++ .../similarity/cosine/CosineSimilarity.java | 84 ++ .../scoring/similarity/cosine/DocVector.java | 57 ++ .../nutch/scoring/similarity/cosine/Model.java | 190 +++++ .../scoring/similarity/cosine/package-info.java | 7 + .../similarity/util/LuceneAnalyzerUtil.java | 93 +++ .../similarity/util/LuceneTokenizer.java | 166 ++++ .../scoring/similarity/util/package-info.java | 24 + nutch-plugins/subcollection/README.txt | 10 + nutch-plugins/subcollection/build.xml | 22 + nutch-plugins/subcollection/ivy.xml | 41 + nutch-plugins/subcollection/plugin.xml | 41 + nutch-plugins/subcollection/pom.xml | 38 + .../nutch/collection/CollectionManager.java | 240 ++++++ .../apache/nutch/collection/Subcollection.java | 259 ++++++ .../org/apache/nutch/collection/package.html | 36 + .../SubcollectionIndexingFilter.java | 101 +++ .../indexer/subcollection/package-info.java | 25 + .../nutch/collection/TestSubcollection.java | 112 +++ nutch-plugins/tld/build.xml | 22 + nutch-plugins/tld/ivy.xml | 41 + nutch-plugins/tld/plugin.xml | 51 ++ nutch-plugins/tld/pom.xml | 38 + .../nutch/indexer/tld/TLDIndexingFilter.java | 69 ++ .../org/apache/nutch/indexer/tld/package.html | 5 + .../nutch/scoring/tld/TLDScoringFilter.java | 114 +++ .../org/apache/nutch/scoring/tld/package.html | 5 + nutch-plugins/urlfilter-automaton/build.xml | 51 ++ nutch-plugins/urlfilter-automaton/ivy.xml | 42 + nutch-plugins/urlfilter-automaton/plugin.xml | 43 + nutch-plugins/urlfilter-automaton/pom.xml | 50 ++ .../urlfilter-automaton/sample/Benchmarks.rules | 26 + .../urlfilter-automaton/sample/Benchmarks.urls | 297 +++++++ .../sample/IntranetCrawling.rules | 24 + .../sample/IntranetCrawling.urls | 8 + .../sample/WholeWebCrawling.rules | 19 + .../sample/WholeWebCrawling.urls | 11 + 
.../urlfilter/automaton/AutomatonURLFilter.java | 116 +++ .../nutch/urlfilter/automaton/package.html | 9 + .../automaton/TestAutomatonURLFilter.java | 56 ++ nutch-plugins/urlfilter-domain/build.xml | 28 + nutch-plugins/urlfilter-domain/data/hosts.txt | 5 + nutch-plugins/urlfilter-domain/ivy.xml | 41 + nutch-plugins/urlfilter-domain/plugin.xml | 43 + nutch-plugins/urlfilter-domain/pom.xml | 38 + .../nutch/urlfilter/domain/DomainURLFilter.java | 212 +++++ .../nutch/urlfilter/domain/package-info.java | 25 + .../urlfilter/domain/TestDomainURLFilter.java | 67 ++ .../urlfilter-domainblacklist/build.xml | 28 + .../urlfilter-domainblacklist/data/hosts.txt | 5 + nutch-plugins/urlfilter-domainblacklist/ivy.xml | 41 + .../urlfilter-domainblacklist/plugin.xml | 43 + nutch-plugins/urlfilter-domainblacklist/pom.xml | 38 + .../DomainBlacklistURLFilter.java | 210 +++++ .../urlfilter/domainblacklist/package-info.java | 24 + .../TestDomainBlacklistURLFilter.java | 49 ++ nutch-plugins/urlfilter-ignoreexempt/README.md | 43 + nutch-plugins/urlfilter-ignoreexempt/build.xml | 55 ++ .../urlfilter-ignoreexempt/data/.donotdelete | 0 nutch-plugins/urlfilter-ignoreexempt/ivy.xml | 41 + nutch-plugins/urlfilter-ignoreexempt/plugin.xml | 45 ++ nutch-plugins/urlfilter-ignoreexempt/pom.xml | 45 ++ .../ignoreexempt/ExemptionUrlFilter.java | 101 +++ .../urlfilter/ignoreexempt/package-info.java | 24 + nutch-plugins/urlfilter-prefix/build.xml | 22 + nutch-plugins/urlfilter-prefix/ivy.xml | 41 + nutch-plugins/urlfilter-prefix/plugin.xml | 47 ++ nutch-plugins/urlfilter-prefix/pom.xml | 38 + .../nutch/urlfilter/prefix/PrefixURLFilter.java | 178 +++++ .../apache/nutch/urlfilter/prefix/package.html | 5 + .../urlfilter/prefix/TestPrefixURLFilter.java | 79 ++ nutch-plugins/urlfilter-regex/build.xml | 51 ++ nutch-plugins/urlfilter-regex/ivy.xml | 41 + nutch-plugins/urlfilter-regex/plugin.xml | 48 ++ nutch-plugins/urlfilter-regex/pom.xml | 46 ++ .../urlfilter-regex/sample/Benchmarks.rules | 26 + 
.../urlfilter-regex/sample/Benchmarks.urls | 297 +++++++ .../sample/IntranetCrawling.rules | 27 + .../sample/IntranetCrawling.urls | 8 + .../sample/WholeWebCrawling.rules | 22 + .../sample/WholeWebCrawling.urls | 11 + .../urlfilter-regex/sample/nutch1838.rules | 12 + .../urlfilter-regex/sample/nutch1838.urls | 3 + .../nutch/urlfilter/regex/RegexURLFilter.java | 111 +++ .../apache/nutch/urlfilter/regex/package.html | 5 + .../urlfilter/regex/TestRegexURLFilter.java | 61 ++ nutch-plugins/urlfilter-suffix/build.xml | 22 + nutch-plugins/urlfilter-suffix/ivy.xml | 41 + nutch-plugins/urlfilter-suffix/plugin.xml | 47 ++ nutch-plugins/urlfilter-suffix/pom.xml | 38 + .../nutch/urlfilter/suffix/SuffixURLFilter.java | 331 ++++++++ .../nutch/urlfilter/suffix/package-info.java | 23 + .../urlfilter/suffix/TestSuffixURLFilter.java | 123 +++ nutch-plugins/urlfilter-validator/build.xml | 22 + nutch-plugins/urlfilter-validator/ivy.xml | 41 + nutch-plugins/urlfilter-validator/plugin.xml | 41 + nutch-plugins/urlfilter-validator/pom.xml | 38 + .../nutch/urlfilter/validator/UrlValidator.java | 386 +++++++++ .../nutch/urlfilter/validator/package.html | 9 + .../urlfilter/validator/TestUrlValidator.java | 79 ++ nutch-plugins/urlmeta/build.xml | 22 + nutch-plugins/urlmeta/ivy.xml | 41 + nutch-plugins/urlmeta/plugin.xml | 47 ++ nutch-plugins/urlmeta/pom.xml | 38 + .../indexer/urlmeta/URLMetaIndexingFilter.java | 118 +++ .../apache/nutch/indexer/urlmeta/package.html | 12 + .../scoring/urlmeta/URLMetaScoringFilter.java | 175 ++++ .../apache/nutch/scoring/urlmeta/package.html | 11 + nutch-plugins/urlnormalizer-ajax/build.xml | 22 + nutch-plugins/urlnormalizer-ajax/ivy.xml | 41 + nutch-plugins/urlnormalizer-ajax/plugin.xml | 41 + nutch-plugins/urlnormalizer-ajax/pom.xml | 38 + .../urlnormalizer/ajax/AjaxURLNormalizer.java | 236 ++++++ .../ajax/TestAjaxURLNormalizer.java | 67 ++ nutch-plugins/urlnormalizer-basic/build.xml | 22 + nutch-plugins/urlnormalizer-basic/ivy.xml | 41 + 
nutch-plugins/urlnormalizer-basic/plugin.xml | 41 + nutch-plugins/urlnormalizer-basic/pom.xml | 38 + .../urlnormalizer/basic/BasicURLNormalizer.java | 290 +++++++ .../net/urlnormalizer/basic/package-info.java | 23 + .../basic/TestBasicURLNormalizer.java | 175 ++++ nutch-plugins/urlnormalizer-host/build.xml | 27 + nutch-plugins/urlnormalizer-host/data/hosts.txt | 8 + nutch-plugins/urlnormalizer-host/ivy.xml | 41 + nutch-plugins/urlnormalizer-host/plugin.xml | 43 + nutch-plugins/urlnormalizer-host/pom.xml | 38 + .../urlnormalizer/host/HostURLNormalizer.java | 198 +++++ .../net/urlnormalizer/host/package-info.java | 23 + .../host/TestHostURLNormalizer.java | 57 ++ nutch-plugins/urlnormalizer-pass/build.xml | 22 + nutch-plugins/urlnormalizer-pass/ivy.xml | 41 + nutch-plugins/urlnormalizer-pass/plugin.xml | 41 + nutch-plugins/urlnormalizer-pass/pom.xml | 38 + .../urlnormalizer/pass/PassURLNormalizer.java | 49 ++ .../net/urlnormalizer/pass/package-info.java | 23 + .../pass/TestPassURLNormalizer.java | 45 ++ nutch-plugins/urlnormalizer-protocol/build.xml | 27 + .../urlnormalizer-protocol/data/protocols.txt | 7 + nutch-plugins/urlnormalizer-protocol/ivy.xml | 41 + nutch-plugins/urlnormalizer-protocol/plugin.xml | 43 + nutch-plugins/urlnormalizer-protocol/pom.xml | 38 + .../protocol/ProtocolURLNormalizer.java | 190 +++++ .../protocol/TestProtocolURLNormalizer.java | 55 ++ .../urlnormalizer-querystring/build.xml | 22 + nutch-plugins/urlnormalizer-querystring/ivy.xml | 41 + .../urlnormalizer-querystring/plugin.xml | 42 + nutch-plugins/urlnormalizer-querystring/pom.xml | 38 + .../querystring/QuerystringURLNormalizer.java | 91 +++ .../urlnormalizer/querystring/package-info.java | 23 + .../TestQuerystringURLNormalizer.java | 49 ++ nutch-plugins/urlnormalizer-regex/build.xml | 34 + nutch-plugins/urlnormalizer-regex/ivy.xml | 41 + nutch-plugins/urlnormalizer-regex/plugin.xml | 41 + nutch-plugins/urlnormalizer-regex/pom.xml | 38 + .../sample/regex-normalize-default.test | 84 ++ 
.../sample/regex-normalize-default.xml | 66 ++ .../sample/regex-normalize-scope1.test | 8 + .../sample/regex-normalize-scope1.xml | 21 + .../urlnormalizer/regex/RegexURLNormalizer.java | 324 ++++++++ .../net/urlnormalizer/regex/package-info.java | 23 + .../regex/TestRegexURLNormalizer.java | 186 +++++ nutch-plugins/urlnormalizer-slash/build.xml | 27 + .../urlnormalizer-slash/data/slashes.txt | 7 + nutch-plugins/urlnormalizer-slash/ivy.xml | 41 + nutch-plugins/urlnormalizer-slash/plugin.xml | 43 + nutch-plugins/urlnormalizer-slash/pom.xml | 38 + .../urlnormalizer/slash/SlashURLNormalizer.java | 224 ++++++ .../slash/TestSlashURLNormalizer.java | 73 ++ pom.xml | 22 +- src/plugin/build-plugin.xml | 255 ------ src/plugin/build.xml | 213 ----- src/plugin/creativecommons/README.txt | 1 - src/plugin/creativecommons/build.xml | 28 - .../creativecommons/conf/crawl-urlfilter.txt | 18 - src/plugin/creativecommons/conf/nutch-site.xml | 50 -- src/plugin/creativecommons/data/anchor.html | 9 - src/plugin/creativecommons/data/rdf.html | 35 - src/plugin/creativecommons/data/rel.html | 6 - src/plugin/creativecommons/ivy.xml | 41 - src/plugin/creativecommons/plugin.xml | 48 -- .../creativecommons/nutch/CCIndexingFilter.java | 124 --- .../creativecommons/nutch/CCParseFilter.java | 300 ------- .../java/org/creativecommons/nutch/package.html | 5 - .../nutch/TestCCParseFilter.java | 73 -- src/plugin/feed/build.xml | 45 -- src/plugin/feed/ivy.xml | 43 - src/plugin/feed/plugin.xml | 49 -- src/plugin/feed/sample/rsstest.rss | 36 - .../nutch/indexer/feed/FeedIndexingFilter.java | 129 --- .../apache/nutch/indexer/feed/package-info.java | 22 - .../org/apache/nutch/parse/feed/FeedParser.java | 374 --------- .../apache/nutch/parse/feed/package-info.java | 22 - .../apache/nutch/parse/feed/TestFeedParser.java | 124 --- src/plugin/headings/build.xml | 22 - src/plugin/headings/ivy.xml | 41 - src/plugin/headings/plugin.xml | 45 -- .../parse/headings/HeadingsParseFilter.java | 124 --- 
.../nutch/parse/headings/package-info.java | 22 - src/plugin/index-anchor/build.xml | 22 - src/plugin/index-anchor/ivy.xml | 41 - src/plugin/index-anchor/plugin.xml | 38 - .../indexer/anchor/AnchorIndexingFilter.java | 107 --- .../apache/nutch/indexer/anchor/package.html | 5 - .../anchor/TestAnchorIndexingFilter.java | 67 -- src/plugin/index-basic/build.xml | 22 - src/plugin/index-basic/ivy.xml | 41 - src/plugin/index-basic/plugin.xml | 42 - .../indexer/basic/BasicIndexingFilter.java | 158 ---- .../org/apache/nutch/indexer/basic/package.html | 5 - .../indexer/basic/TestBasicIndexingFilter.java | 99 --- src/plugin/index-geoip/build-ivy.xml | 54 -- src/plugin/index-geoip/build.xml | 27 - src/plugin/index-geoip/ivy.xml | 46 -- src/plugin/index-geoip/plugin.xml | 51 -- .../indexer/geoip/GeoIPDocumentCreator.java | 210 ----- .../indexer/geoip/GeoIPIndexingFilter.java | 241 ------ .../nutch/indexer/geoip/package-info.java | 28 - src/plugin/index-links/build.xml | 22 - src/plugin/index-links/ivy.xml | 41 - src/plugin/index-links/plugin.xml | 41 - .../indexer/links/LinksIndexingFilter.java | 167 ---- .../indexer/links/TestLinksIndexingFilter.java | 218 ----- .../org/apache/nutch/parse/TestOutlinks.java | 54 -- src/plugin/index-metadata/build.xml | 22 - src/plugin/index-metadata/ivy.xml | 41 - src/plugin/index-metadata/plugin.xml | 42 - .../nutch/indexer/metadata/MetadataIndexer.java | 104 --- .../nutch/indexer/metadata/package-info.java | 23 - src/plugin/index-more/build.xml | 22 - src/plugin/index-more/ivy.xml | 41 - src/plugin/index-more/plugin.xml | 42 - .../nutch/indexer/more/MoreIndexingFilter.java | 344 -------- .../org/apache/nutch/indexer/more/package.html | 6 - .../indexer/more/TestMoreIndexingFilter.java | 123 --- src/plugin/index-replace/README.txt | 95 --- src/plugin/index-replace/build.xml | 55 -- src/plugin/index-replace/ivy.xml | 41 - src/plugin/index-replace/plugin.xml | 22 - .../index-replace/sample/testIndexReplace.html | 12 - 
.../nutch/indexer/replace/FieldReplacer.java | 196 ----- .../nutch/indexer/replace/ReplaceIndexer.java | 330 -------- .../nutch/indexer/replace/package-info.java | 22 - .../nutch/indexer/replace/TestIndexReplace.java | 456 ----------- src/plugin/index-static/build.xml | 22 - src/plugin/index-static/ivy.xml | 41 - src/plugin/index-static/plugin.xml | 42 - .../indexer/staticfield/StaticFieldIndexer.java | 143 ---- .../nutch/indexer/staticfield/package.html | 5 - .../staticfield/TestStaticFieldIndexerTest.java | 194 ----- src/plugin/indexer-cloudsearch/README.md | 58 -- src/plugin/indexer-cloudsearch/build.xml | 22 - .../indexer-cloudsearch/createCSDomain.sh | 22 - src/plugin/indexer-cloudsearch/ivy.xml | 41 - src/plugin/indexer-cloudsearch/plugin.xml | 50 -- .../cloudsearch/CloudSearchConstants.java | 27 - .../cloudsearch/CloudSearchIndexWriter.java | 382 --------- .../cloudsearch/CloudSearchUtils.java | 73 -- src/plugin/indexer-dummy/build.xml | 22 - src/plugin/indexer-dummy/ivy.xml | 41 - src/plugin/indexer-dummy/plugin.xml | 38 - .../indexwriter/dummy/DummyIndexWriter.java | 103 --- .../nutch/indexwriter/dummy/package-info.java | 23 - src/plugin/indexer-elastic/build-ivy.xml | 54 -- src/plugin/indexer-elastic/build.xml | 22 - src/plugin/indexer-elastic/howto_upgrade_es.txt | 6 - src/plugin/indexer-elastic/ivy.xml | 43 - src/plugin/indexer-elastic/plugin.xml | 71 -- .../indexwriter/elastic/ElasticConstants.java | 28 - .../indexwriter/elastic/ElasticIndexWriter.java | 279 ------- .../nutch/indexwriter/elastic/package-info.java | 22 - src/plugin/indexer-solr/build-ivy.xml | 54 -- src/plugin/indexer-solr/build.xml | 22 - src/plugin/indexer-solr/ivy.xml | 44 - src/plugin/indexer-solr/plugin.xml | 48 -- .../nutch/indexwriter/solr/SolrConstants.java | 56 -- .../nutch/indexwriter/solr/SolrIndexWriter.java | 277 ------- .../indexwriter/solr/SolrMappingReader.java | 147 ---- .../nutch/indexwriter/solr/SolrUtils.java | 97 --- .../nutch/indexwriter/solr/package-info.java | 22 
- src/plugin/language-identifier/build.xml | 38 - src/plugin/language-identifier/ivy.xml | 41 - src/plugin/language-identifier/plugin.xml | 49 -- .../nutch/analysis/lang/HTMLLanguageParser.java | 320 -------- .../analysis/lang/LanguageIndexingFilter.java | 89 --- .../nutch/analysis/lang/langmappings.properties | 188 ----- .../org/apache/nutch/analysis/lang/package.html | 6 - .../analysis/lang/TestHTMLLanguageParser.java | 149 ---- .../test/org/apache/nutch/analysis/lang/da.test | 108 --- .../test/org/apache/nutch/analysis/lang/de.test | 104 --- .../test/org/apache/nutch/analysis/lang/el.test | 109 --- .../test/org/apache/nutch/analysis/lang/en.test | 105 --- .../test/org/apache/nutch/analysis/lang/es.test | 107 --- .../test/org/apache/nutch/analysis/lang/fi.test | 106 --- .../test/org/apache/nutch/analysis/lang/fr.test | 105 --- .../test/org/apache/nutch/analysis/lang/it.test | 109 --- .../test/org/apache/nutch/analysis/lang/nl.test | 105 --- .../test/org/apache/nutch/analysis/lang/pt.test | 105 --- .../test/org/apache/nutch/analysis/lang/sv.test | 108 --- .../nutch/analysis/lang/test-referencial.txt | 10 - src/plugin/lib-htmlunit/build-ivy.xml | 54 -- src/plugin/lib-htmlunit/build.xml | 28 - src/plugin/lib-htmlunit/ivy.xml | 52 -- src/plugin/lib-htmlunit/plugin.xml | 166 ---- .../protocol/htmlunit/HtmlUnitWebDriver.java | 189 ----- .../htmlunit/HtmlUnitWebWindowListener.java | 53 -- src/plugin/lib-http/build.xml | 22 - src/plugin/lib-http/ivy.xml | 41 - src/plugin/lib-http/plugin.xml | 33 - .../protocol/http/api/BlockedException.java | 26 - .../nutch/protocol/http/api/HttpBase.java | 587 -------------- .../nutch/protocol/http/api/HttpException.java | 40 - .../protocol/http/api/HttpRobotRulesParser.java | 167 ---- .../apache/nutch/protocol/http/api/package.html | 6 - .../protocol/http/api/TestRobotRulesParser.java | 123 --- src/plugin/lib-nekohtml/build.xml | 30 - src/plugin/lib-nekohtml/ivy.xml | 42 - src/plugin/lib-nekohtml/plugin.xml | 38 - 
src/plugin/lib-regex-filter/build.xml | 22 - src/plugin/lib-regex-filter/ivy.xml | 41 - src/plugin/lib-regex-filter/plugin.xml | 33 - .../apache/nutch/urlfilter/api/RegexRule.java | 102 --- .../nutch/urlfilter/api/RegexURLFilterBase.java | 315 -------- .../nutch/urlfilter/api/package-info.java | 23 - .../urlfilter/api/RegexURLFilterBaseTest.java | 134 ---- src/plugin/lib-selenium/build-ivy.xml | 54 -- src/plugin/lib-selenium/build.xml | 28 - .../lib-selenium/howto_upgrade_selenium.txt | 15 - src/plugin/lib-selenium/ivy.xml | 52 -- src/plugin/lib-selenium/plugin.xml | 175 ---- .../nutch/protocol/selenium/HttpWebClient.java | 236 ------ src/plugin/lib-xml/build.xml | 36 - src/plugin/lib-xml/ivy.xml | 44 - src/plugin/lib-xml/plugin.xml | 65 -- src/plugin/microformats-reltag/build.xml | 27 - src/plugin/microformats-reltag/ivy.xml | 41 - src/plugin/microformats-reltag/plugin.xml | 49 -- .../reltag/RelTagIndexingFilter.java | 77 -- .../nutch/microformats/reltag/RelTagParser.java | 148 ---- .../nutch/microformats/reltag/package.html | 8 - src/plugin/mimetype-filter/build.xml | 28 - src/plugin/mimetype-filter/ivy.xml | 41 - src/plugin/mimetype-filter/plugin.xml | 37 - .../mimetype-filter/sample/allow-images.txt | 34 - .../mimetype-filter/sample/block-html.txt | 34 - .../indexer/filter/MimeTypeIndexingFilter.java | 273 ------- .../filter/MimeTypeIndexingFilterTest.java | 114 --- src/plugin/nutch-extensionpoints/build.xml | 30 - src/plugin/nutch-extensionpoints/ivy.xml | 41 - src/plugin/nutch-extensionpoints/plugin.xml | 67 -- src/plugin/parse-ext/build.xml | 32 - src/plugin/parse-ext/command | 24 - src/plugin/parse-ext/ivy.xml | 41 - src/plugin/parse-ext/plugin.xml | 60 -- .../org/apache/nutch/parse/ext/ExtParser.java | 183 ----- .../apache/nutch/parse/ext/package-info.java | 22 - .../apache/nutch/parse/ext/TestExtParser.java | 130 --- src/plugin/parse-html/build.xml | 40 - src/plugin/parse-html/ivy.xml | 42 - src/plugin/parse-html/plugin.xml | 48 -- 
.../org/apache/nutch/parse/html/DOMBuilder.java | 766 ------------------ .../nutch/parse/html/DOMContentUtils.java | 400 ---------- .../nutch/parse/html/HTMLMetaProcessor.java | 214 ----- .../org/apache/nutch/parse/html/HtmlParser.java | 352 -------- .../parse/html/XMLCharacterRecognizer.java | 112 --- .../org/apache/nutch/parse/html/package.html | 5 - .../nutch/parse/html/TestDOMContentUtils.java | 347 -------- .../apache/nutch/parse/html/TestHtmlParser.java | 122 --- .../parse/html/TestRobotsMetaProcessor.java | 155 ---- src/plugin/parse-js/build.xml | 22 - src/plugin/parse-js/ivy.xml | 41 - src/plugin/parse-js/plugin.xml | 53 -- .../apache/nutch/parse/js/JSParseFilter.java | 301 ------- .../org/apache/nutch/parse/js/package-info.java | 23 - src/plugin/parse-metatags/README.txt | 17 - src/plugin/parse-metatags/build.xml | 37 - src/plugin/parse-metatags/ivy.xml | 41 - src/plugin/parse-metatags/plugin.xml | 22 - .../parse-metatags/sample/testMetatags.html | 9 - .../sample/testMultivalueMetatags.html | 12 - .../nutch/parse/metatags/MetaTagsParser.java | 124 --- .../nutch/parse/metatags/package-info.java | 24 - .../nutch/parse/metatags/TestMetatagParser.java | 104 --- src/plugin/parse-replace/README.txt | 91 --- src/plugin/parse-replace/build.xml | 37 - src/plugin/parse-replace/ivy.xml | 41 - src/plugin/parse-replace/plugin.xml | 22 - .../parse-replace/sample/testParseReplace.html | 11 - .../nutch/parse/replace/ReplaceParser.java | 74 -- .../nutch/parse/replace/package-info.java | 22 - .../nutch/parse/replace/TestParseReplace.java | 68 -- src/plugin/parse-swf/build.xml | 38 - src/plugin/parse-swf/ivy.xml | 41 - src/plugin/parse-swf/lib/javaswf-LICENSE.txt | 33 - src/plugin/parse-swf/lib/javaswf.jar | Bin 125369 -> 0 bytes src/plugin/parse-swf/plugin.xml | 44 - src/plugin/parse-swf/sample/test1.swf | Bin 21054 -> 0 bytes src/plugin/parse-swf/sample/test1.txt | 60 -- src/plugin/parse-swf/sample/test2.swf | Bin 42534 -> 0 bytes src/plugin/parse-swf/sample/test2.txt | 5 
- src/plugin/parse-swf/sample/test3.swf | Bin 51562 -> 0 bytes src/plugin/parse-swf/sample/test3.txt | 11 - .../org/apache/nutch/parse/swf/SWFParser.java | 685 ---------------- .../apache/nutch/parse/swf/package-info.java | 22 - .../apache/nutch/parse/swf/TestSWFParser.java | 94 --- src/plugin/parse-tika/build-ivy.xml | 54 -- src/plugin/parse-tika/build.xml | 55 -- src/plugin/parse-tika/howto_upgrade_tika.txt | 8 - src/plugin/parse-tika/ivy.xml | 46 -- src/plugin/parse-tika/plugin.xml | 136 ---- src/plugin/parse-tika/sample/encrypted.pdf | Bin 3431 -> 0 bytes src/plugin/parse-tika/sample/nutch.html | 519 ------------ src/plugin/parse-tika/sample/nutch_logo_tm.gif | Bin 2747 -> 0 bytes src/plugin/parse-tika/sample/ootest.odt | Bin 20753 -> 0 bytes src/plugin/parse-tika/sample/ootest.sxw | Bin 20125 -> 0 bytes src/plugin/parse-tika/sample/ootest.txt | 30 - src/plugin/parse-tika/sample/pdftest.pdf | 157 ---- src/plugin/parse-tika/sample/rsstest.rss | 37 - src/plugin/parse-tika/sample/test.rtf | 17 - src/plugin/parse-tika/sample/word97.doc | Bin 8192 -> 0 bytes .../tika/BoilerpipeExtractorRepository.java | 62 -- .../org/apache/nutch/parse/tika/DOMBuilder.java | 794 ------------------- .../nutch/parse/tika/DOMContentUtils.java | 402 ---------- .../nutch/parse/tika/HTMLMetaProcessor.java | 214 ----- .../org/apache/nutch/parse/tika/TikaParser.java | 286 ------- .../parse/tika/XMLCharacterRecognizer.java | 112 --- .../apache/nutch/parse/tika/package-info.java | 23 - .../apache/nutch/tika/TestDOMContentUtils.java | 337 -------- .../org/apache/nutch/tika/TestFeedParser.java | 121 --- .../apache/nutch/tika/TestImageMetadata.java | 67 -- .../org/apache/nutch/tika/TestMSWordParser.java | 92 --- .../org/apache/nutch/tika/TestOOParser.java | 107 --- .../org/apache/nutch/tika/TestPdfParser.java | 73 -- .../org/apache/nutch/tika/TestRTFParser.java | 81 -- .../nutch/tika/TestRobotsMetaProcessor.java | 156 ---- src/plugin/parse-zip/build.xml | 38 - src/plugin/parse-zip/ivy.xml | 41 - 
src/plugin/parse-zip/plugin.xml | 46 -- src/plugin/parse-zip/sample/test.zip | Bin 182 -> 0 bytes .../org/apache/nutch/parse/zip/ZipParser.java | 144 ---- .../nutch/parse/zip/ZipTextExtractor.java | 120 --- .../apache/nutch/parse/zip/package-info.java | 22 - .../apache/nutch/parse/zip/TestZipParser.java | 71 -- src/plugin/parsefilter-naivebayes/build-ivy.xml | 54 -- src/plugin/parsefilter-naivebayes/build.xml | 22 - src/plugin/parsefilter-naivebayes/ivy.xml | 49 -- src/plugin/parsefilter-naivebayes/plugin.xml | 56 -- .../nutch/parsefilter/naivebayes/Classify.java | 120 --- .../naivebayes/NaiveBayesParseFilter.java | 197 ----- .../nutch/parsefilter/naivebayes/Train.java | 148 ---- .../parsefilter/naivebayes/package-info.java | 28 - src/plugin/parsefilter-regex/build.xml | 27 - .../data/regex-parsefilter.txt | 10 - src/plugin/parsefilter-regex/ivy.xml | 37 - src/plugin/parsefilter-regex/plugin.xml | 42 - .../parsefilter/regex/RegexParseFilter.java | 199 ----- .../nutch/parsefilter/regex/package-info.java | 23 - .../parsefilter/regex/TestRegexParseFilter.java | 77 -- src/plugin/plugin.dtd | 206 ----- src/plugin/protocol-file/build.xml | 29 - src/plugin/protocol-file/ivy.xml | 41 - src/plugin/protocol-file/plugin.xml | 46 -- .../protocol-file/sample/testprotocolfile.txt | 1 - .../sample/testprotocolfile_(encoded).txt | 1 - .../org/apache/nutch/protocol/file/File.java | 228 ------ .../apache/nutch/protocol/file/FileError.java | 36 - .../nutch/protocol/file/FileException.java | 40 - .../nutch/protocol/file/FileResponse.java | 317 -------- .../org/apache/nutch/protocol/file/package.html | 5 - .../nutch/protocol/file/TestProtocolFile.java | 99 --- src/plugin/protocol-ftp/build.xml | 22 - src/plugin/protocol-ftp/ivy.xml | 42 - src/plugin/protocol-ftp/plugin.xml | 46 -- .../org/apache/nutch/protocol/ftp/Client.java | 595 -------------- .../java/org/apache/nutch/protocol/ftp/Ftp.java | 267 ------- .../org/apache/nutch/protocol/ftp/FtpError.java | 36 - 
.../apache/nutch/protocol/ftp/FtpException.java | 46 -- .../ftp/FtpExceptionBadSystResponse.java | 29 - .../FtpExceptionCanNotHaveDataConnection.java | 29 - ...ExceptionControlClosedByForcedDataClose.java | 30 - .../ftp/FtpExceptionUnknownForcedDataClose.java | 30 - .../apache/nutch/protocol/ftp/FtpResponse.java | 521 ------------ .../nutch/protocol/ftp/FtpRobotRulesParser.java | 121 --- .../protocol/ftp/PrintCommandListener.java | 71 -- .../org/apache/nutch/protocol/ftp/package.html | 5 - src/plugin/protocol-htmlunit/build.xml | 37 - src/plugin/protocol-htmlunit/ivy.xml | 38 - src/plugin/protocol-htmlunit/plugin.xml | 51 -- .../apache/nutch/protocol/htmlunit/Http.java | 63 -- .../nutch/protocol/htmlunit/HttpResponse.java | 573 ------------- .../apache/nutch/protocol/htmlunit/package.html | 21 - src/plugin/protocol-http/build.xml | 50 -- src/plugin/protocol-http/ivy.xml | 41 - src/plugin/protocol-http/jsp/basic-http.jsp | 44 - src/plugin/protocol-http/jsp/brokenpage.jsp | 47 -- src/plugin/protocol-http/jsp/redirect301.jsp | 49 -- src/plugin/protocol-http/jsp/redirect302.jsp | 49 -- src/plugin/protocol-http/plugin.xml | 51 -- .../org/apache/nutch/protocol/http/Http.java | 73 -- .../nutch/protocol/http/HttpResponse.java | 558 ------------- .../org/apache/nutch/protocol/http/package.html | 5 - .../src/test/conf/nutch-site-test.xml | 52 -- .../nutch/protocol/http/TestProtocolHttp.java | 140 ---- src/plugin/protocol-httpclient/build.xml | 45 -- src/plugin/protocol-httpclient/ivy.xml | 42 - src/plugin/protocol-httpclient/jsp/basic.jsp | 74 -- src/plugin/protocol-httpclient/jsp/cookies.jsp | 63 -- src/plugin/protocol-httpclient/jsp/digest.jsp | 68 -- src/plugin/protocol-httpclient/jsp/noauth.jsp | 36 - src/plugin/protocol-httpclient/jsp/ntlm.jsp | 89 --- src/plugin/protocol-httpclient/plugin.xml | 58 -- .../DummySSLProtocolSocketFactory.java | 163 ---- .../httpclient/DummyX509TrustManager.java | 92 --- .../apache/nutch/protocol/httpclient/Http.java | 572 ------------- 
.../protocol/httpclient/HttpAuthentication.java | 45 -- .../httpclient/HttpAuthenticationException.java | 71 -- .../httpclient/HttpAuthenticationFactory.java | 98 --- .../httpclient/HttpBasicAuthentication.java | 199 ----- .../httpclient/HttpFormAuthConfigurer.java | 106 --- .../httpclient/HttpFormAuthentication.java | 223 ------ .../nutch/protocol/httpclient/HttpResponse.java | 216 ----- .../nutch/protocol/httpclient/package.html | 9 - .../src/test/conf/httpclient-auth-test.xml | 58 -- .../src/test/conf/nutch-site-test.xml | 52 -- .../httpclient/TestProtocolHttpClient.java | 217 ----- .../protocol-interactiveselenium/README.md | 38 - .../protocol-interactiveselenium/build-ivy.xml | 54 -- .../protocol-interactiveselenium/build.xml | 37 - src/plugin/protocol-interactiveselenium/ivy.xml | 42 - .../protocol-interactiveselenium/plugin.xml | 47 -- .../protocol/interactiveselenium/Http.java | 59 -- .../interactiveselenium/HttpResponse.java | 399 ---------- .../DefalultMultiInteractionHandler.java | 53 -- .../DefaultClickAllAjaxLinksHandler.java | 88 -- .../handlers/DefaultHandler.java | 30 - .../handlers/InteractiveSeleniumHandler.java | 25 - .../protocol/interactiveselenium/package.html | 5 - src/plugin/protocol-selenium/README.md | 208 ----- src/plugin/protocol-selenium/build-ivy.xml | 54 -- src/plugin/protocol-selenium/build.xml | 36 - src/plugin/protocol-selenium/ivy.xml | 42 - src/plugin/protocol-selenium/plugin.xml | 47 -- .../apache/nutch/protocol/selenium/Http.java | 59 -- .../nutch/protocol/selenium/HttpResponse.java | 360 --------- .../apache/nutch/protocol/selenium/package.html | 5 - src/plugin/scoring-depth/build.xml | 6 - src/plugin/scoring-depth/ivy.xml | 41 - src/plugin/scoring-depth/plugin.xml | 24 - .../nutch/scoring/depth/DepthScoringFilter.java | 207 ----- .../nutch/scoring/depth/package-info.java | 23 - src/plugin/scoring-link/build.xml | 27 - src/plugin/scoring-link/ivy.xml | 41 - src/plugin/scoring-link/plugin.xml | 39 - 
.../scoring/link/LinkAnalysisScoringFilter.java | 95 --- .../apache/nutch/scoring/link/package-info.java | 23 - src/plugin/scoring-opic/build.xml | 27 - src/plugin/scoring-opic/ivy.xml | 41 - src/plugin/scoring-opic/plugin.xml | 39 - .../nutch/scoring/opic/OPICScoringFilter.java | 173 ---- .../apache/nutch/scoring/opic/package-info.java | 23 - src/plugin/scoring-similarity/build-ivy.xml | 54 -- src/plugin/scoring-similarity/build.xml | 27 - src/plugin/scoring-similarity/ivy.xml | 42 - src/plugin/scoring-similarity/plugin.xml | 45 -- .../scoring/similarity/SimilarityModel.java | 38 - .../similarity/SimilarityScoringFilter.java | 70 -- .../similarity/cosine/CosineSimilarity.java | 84 -- .../scoring/similarity/cosine/DocVector.java | 57 -- .../nutch/scoring/similarity/cosine/Model.java | 190 ----- .../scoring/similarity/cosine/package-info.java | 7 - .../similarity/util/LuceneAnalyzerUtil.java | 93 --- .../similarity/util/LuceneTokenizer.java | 166 ---- .../scoring/similarity/util/package-info.java | 24 - src/plugin/subcollection/README.txt | 10 - src/plugin/subcollection/build.xml | 22 - src/plugin/subcollection/ivy.xml | 41 - src/plugin/subcollection/plugin.xml | 41 - .../nutch/collection/CollectionManager.java | 240 ------ .../apache/nutch/collection/Subcollection.java | 259 ------ .../org/apache/nutch/collection/package.html | 36 - .../SubcollectionIndexingFilter.java | 101 --- .../indexer/subcollection/package-info.java | 25 - .../nutch/collection/TestSubcollection.java | 112 --- src/plugin/tld/build.xml | 22 - src/plugin/tld/ivy.xml | 41 - src/plugin/tld/plugin.xml | 51 -- .../nutch/indexer/tld/TLDIndexingFilter.java | 69 -- .../org/apache/nutch/indexer/tld/package.html | 5 - .../nutch/scoring/tld/TLDScoringFilter.java | 114 --- .../org/apache/nutch/scoring/tld/package.html | 5 - src/plugin/urlfilter-automaton/build.xml | 51 -- src/plugin/urlfilter-automaton/ivy.xml | 42 - src/plugin/urlfilter-automaton/plugin.xml | 43 - 
.../urlfilter-automaton/sample/Benchmarks.rules | 26 - .../urlfilter-automaton/sample/Benchmarks.urls | 297 ------- .../sample/IntranetCrawling.rules | 24 - .../sample/IntranetCrawling.urls | 8 - .../sample/WholeWebCrawling.rules | 19 - .../sample/WholeWebCrawling.urls | 11 - .../urlfilter/automaton/AutomatonURLFilter.java | 116 --- .../nutch/urlfilter/automaton/package.html | 9 - .../automaton/TestAutomatonURLFilter.java | 56 -- src/plugin/urlfilter-domain/build.xml | 28 - src/plugin/urlfilter-domain/data/hosts.txt | 5 - src/plugin/urlfilter-domain/ivy.xml | 41 - src/plugin/urlfilter-domain/plugin.xml | 43 - .../nutch/urlfilter/domain/DomainURLFilter.java | 212 ----- .../nutch/urlfilter/domain/package-info.java | 25 - .../urlfilter/domain/TestDomainURLFilter.java | 67 -- src/plugin/urlfilter-domainblacklist/build.xml | 28 - .../urlfilter-domainblacklist/data/hosts.txt | 5 - src/plugin/urlfilter-domainblacklist/ivy.xml | 41 - src/plugin/urlfilter-domainblacklist/plugin.xml | 43 - .../DomainBlacklistURLFilter.java | 210 ----- .../urlfilter/domainblacklist/package-info.java | 24 - .../TestDomainBlacklistURLFilter.java | 49 -- src/plugin/urlfilter-ignoreexempt/README.md | 43 - src/plugin/urlfilter-ignoreexempt/build.xml | 55 -- .../urlfilter-ignoreexempt/data/.donotdelete | 0 src/plugin/urlfilter-ignoreexempt/ivy.xml | 41 - src/plugin/urlfilter-ignoreexempt/plugin.xml | 45 -- .../ignoreexempt/ExemptionUrlFilter.java | 101 --- .../urlfilter/ignoreexempt/package-info.java | 24 - src/plugin/urlfilter-prefix/build.xml | 22 - src/plugin/urlfilter-prefix/ivy.xml | 41 - src/plugin/urlfilter-prefix/plugin.xml | 47 -- .../nutch/urlfilter/prefix/PrefixURLFilter.java | 178 ----- .../apache/nutch/urlfilter/prefix/package.html | 5 - .../urlfilter/prefix/TestPrefixURLFilter.java | 79 -- src/plugin/urlfilter-regex/build.xml | 51 -- src/plugin/urlfilter-regex/ivy.xml | 41 - src/plugin/urlfilter-regex/plugin.xml | 48 -- .../urlfilter-regex/sample/Benchmarks.rules | 26 - 
.../urlfilter-regex/sample/Benchmarks.urls | 297 ------- .../sample/IntranetCrawling.rules | 27 - .../sample/IntranetCrawling.urls | 8 - .../sample/WholeWebCrawling.rules | 22 - .../sample/WholeWebCrawling.urls | 11 - .../urlfilter-regex/sample/nutch1838.rules | 12 - .../urlfilter-regex/sample/nutch1838.urls | 3 - .../nutch/urlfilter/regex/RegexURLFilter.java | 111 --- .../apache/nutch/urlfilter/regex/package.html | 5 - .../urlfilter/regex/TestRegexURLFilter.java | 61 -- src/plugin/urlfilter-suffix/build.xml | 22 - src/plugin/urlfilter-suffix/ivy.xml | 41 - src/plugin/urlfilter-suffix/plugin.xml | 47 -- .../nutch/urlfilter/suffix/SuffixURLFilter.java | 331 -------- .../nutch/urlfilter/suffix/package-info.java | 23 - .../urlfilter/suffix/TestSuffixURLFilter.java | 123 --- src/plugin/urlfilter-validator/build.xml | 22 - src/plugin/urlfilter-validator/ivy.xml | 41 - src/plugin/urlfilter-validator/plugin.xml | 41 - .../nutch/urlfilter/validator/UrlValidator.java | 386 --------- .../nutch/urlfilter/validator/package.html | 9 - .../urlfilter/validator/TestUrlValidator.java | 79 -- src/plugin/urlmeta/build.xml | 22 - src/plugin/urlmeta/ivy.xml | 41 - src/plugin/urlmeta/plugin.xml | 47 -- .../indexer/urlmeta/URLMetaIndexingFilter.java | 118 --- .../apache/nutch/indexer/urlmeta/package.html | 12 - .../scoring/urlmeta/URLMetaScoringFilter.java | 175 ---- .../apache/nutch/scoring/urlmeta/package.html | 11 - src/plugin/urlnormalizer-ajax/build.xml | 22 - src/plugin/urlnormalizer-ajax/ivy.xml | 41 - src/plugin/urlnormalizer-ajax/plugin.xml | 41 - .../urlnormalizer/ajax/AjaxURLNormalizer.java | 236 ------ .../ajax/TestAjaxURLNormalizer.java | 67 -- src/plugin/urlnormalizer-basic/build.xml | 22 - src/plugin/urlnormalizer-basic/ivy.xml | 41 - src/plugin/urlnormalizer-basic/plugin.xml | 41 - .../urlnormalizer/basic/BasicURLNormalizer.java | 290 ------- .../net/urlnormalizer/basic/package-info.java | 23 - .../basic/TestBasicURLNormalizer.java | 175 ---- 
src/plugin/urlnormalizer-host/build.xml | 27 - src/plugin/urlnormalizer-host/data/hosts.txt | 8 - src/plugin/urlnormalizer-host/ivy.xml | 41 - src/plugin/urlnormalizer-host/plugin.xml | 43 - .../urlnormalizer/host/HostURLNormalizer.java | 198 ----- .../net/urlnormalizer/host/package-info.java | 23 - .../host/TestHostURLNormalizer.java | 57 -- src/plugin/urlnormalizer-pass/build.xml | 22 - src/plugin/urlnormalizer-pass/ivy.xml | 41 - src/plugin/urlnormalizer-pass/plugin.xml | 41 - .../urlnormalizer/pass/PassURLNormalizer.java | 49 -- .../net/urlnormalizer/pass/package-info.java | 23 - .../pass/TestPassURLNormalizer.java | 45 -- src/plugin/urlnormalizer-protocol/build.xml | 27 - .../urlnormalizer-protocol/data/protocols.txt | 7 - src/plugin/urlnormalizer-protocol/ivy.xml | 41 - src/plugin/urlnormalizer-protocol/plugin.xml | 43 - .../protocol/ProtocolURLNormalizer.java | 190 ----- .../protocol/TestProtocolURLNormalizer.java | 55 -- src/plugin/urlnormalizer-querystring/build.xml | 22 - src/plugin/urlnormalizer-querystring/ivy.xml | 41 - src/plugin/urlnormalizer-querystring/plugin.xml | 42 - .../querystring/QuerystringURLNormalizer.java | 91 --- .../urlnormalizer/querystring/package-info.java | 23 - .../TestQuerystringURLNormalizer.java | 49 -- src/plugin/urlnormalizer-regex/build.xml | 34 - src/plugin/urlnormalizer-regex/ivy.xml | 41 - src/plugin/urlnormalizer-regex/plugin.xml | 41 - .../sample/regex-normalize-default.test | 84 -- .../sample/regex-normalize-default.xml | 66 -- .../sample/regex-normalize-scope1.test | 8 - .../sample/regex-normalize-scope1.xml | 21 - .../urlnormalizer/regex/RegexURLNormalizer.java | 324 -------- .../net/urlnormalizer/regex/package-info.java | 23 - .../regex/TestRegexURLNormalizer.java | 186 ----- src/plugin/urlnormalizer-slash/build.xml | 27 - src/plugin/urlnormalizer-slash/data/slashes.txt | 7 - src/plugin/urlnormalizer-slash/ivy.xml | 41 - src/plugin/urlnormalizer-slash/plugin.xml | 43 - .../urlnormalizer/slash/SlashURLNormalizer.java 
| 224 ------ .../slash/TestSlashURLNormalizer.java | 73 -- .../fetch-test-site/dup_of_pagea.html | 11 - .../fetch-test-site/exception.html | 13 - src/testresources/fetch-test-site/index.html | 13 - .../fetch-test-site/nested_spider_trap.html | 23 - src/testresources/fetch-test-site/pagea.html | 11 - src/testresources/fetch-test-site/pageb.html | 11 - src/testresources/fetch-test-site/robots.txt | 0 src/testresources/test-mime-util/test.xlsx | Bin 3950 -> 0 bytes .../20150309101625/content/part-00000/.data.crc | Bin 124 -> 0 bytes .../content/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101625/content/part-00000/data | Bin 14452 -> 0 bytes .../20150309101625/content/part-00000/index | Bin 217 -> 0 bytes .../crawl_fetch/part-00000/.data.crc | Bin 12 -> 0 bytes .../crawl_fetch/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101625/crawl_fetch/part-00000/data | Bin 293 -> 0 bytes .../20150309101625/crawl_fetch/part-00000/index | Bin 217 -> 0 bytes .../crawl_generate/.part-00000.crc | Bin 12 -> 0 bytes .../20150309101625/crawl_generate/part-00000 | Bin 169 -> 0 bytes .../20150309101625/crawl_parse/.part-00000.crc | Bin 68 -> 0 bytes .../20150309101625/crawl_parse/part-00000 | Bin 7627 -> 0 bytes .../parse_data/part-00000/.data.crc | Bin 24 -> 0 bytes .../parse_data/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101625/parse_data/part-00000/data | Bin 1985 -> 0 bytes .../20150309101625/parse_data/part-00000/index | Bin 217 -> 0 bytes .../parse_text/part-00000/.data.crc | Bin 60 -> 0 bytes .../parse_text/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101625/parse_text/part-00000/data | Bin 6554 -> 0 bytes .../20150309101625/parse_text/part-00000/index | Bin 217 -> 0 bytes .../20150309101656/content/part-00000/.data.crc | Bin 3372 -> 0 bytes .../content/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101656/content/part-00000/data | Bin 430250 -> 0 bytes .../20150309101656/content/part-00000/index | Bin 220 -> 0 bytes 
.../crawl_fetch/part-00000/.data.crc | Bin 104 -> 0 bytes .../crawl_fetch/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101656/crawl_fetch/part-00000/data | Bin 12121 -> 0 bytes .../20150309101656/crawl_fetch/part-00000/index | Bin 220 -> 0 bytes .../crawl_generate/.part-00000.crc | Bin 52 -> 0 bytes .../20150309101656/crawl_generate/part-00000 | Bin 5590 -> 0 bytes .../20150309101656/crawl_parse/.part-00000.crc | Bin 1652 -> 0 bytes .../20150309101656/crawl_parse/part-00000 | Bin 210047 -> 0 bytes .../parse_data/part-00000/.data.crc | Bin 460 -> 0 bytes .../parse_data/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101656/parse_data/part-00000/data | Bin 57355 -> 0 bytes .../20150309101656/parse_data/part-00000/index | Bin 220 -> 0 bytes .../parse_text/part-00000/.data.crc | Bin 1260 -> 0 bytes .../parse_text/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101656/parse_text/part-00000/data | Bin 159920 -> 0 bytes .../20150309101656/parse_text/part-00000/index | Bin 220 -> 0 bytes 1253 files changed, 48889 insertions(+), 46080 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 5b3c687..7a70f9d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,9 @@ build/ runtime/ logs/ /bin/ + +*.class +target/ +nutch-core/target +nutch-plugins/target +nutch-plugins/*/target \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html b/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html new file mode 100644 index 0000000..6444c41 --- /dev/null +++ 
b/nutch-core/src/test/resources/fetch-test-site/dup_of_pagea.html @@ -0,0 +1,11 @@ +<html> + <head> + <title>page a</title> + </head> +<body> +This is page a +<a href="index.html">home</a> +<hr> +Nutch fetcher test page +</body> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/exception.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/exception.html b/nutch-core/src/test/resources/fetch-test-site/exception.html new file mode 100644 index 0000000..e1192a1 --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/exception.html @@ -0,0 +1,13 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> +<HTML> +<HEAD> +<TITLE>Exception</TITLE> +<META http-equiv="Content-Type" content="text/html; charset=unicode"> +</HEAD> +<BODY> +!!Trying to parse this one will fail with a MalformedInputException!! + +Nutch fetcher test page. +</BODY> +</HTML> + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-core/src/test/resources/fetch-test-site/index.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/test/resources/fetch-test-site/index.html b/nutch-core/src/test/resources/fetch-test-site/index.html new file mode 100644 index 0000000..d73ff3f --- /dev/null +++ b/nutch-core/src/test/resources/fetch-test-site/index.html @@ -0,0 +1,13 @@ +<html> + <head> + <title>front page</title> + </head> +<body> +This is front page. +<a href="pagea.html">Page a</a> +<a href="pageb.html">Page b</a> +<a href="dup_of_pagea.html">dup of Page a</a> +<hr> +Nutch fetcher test page +</body> +</html> \ No newline at end of file
