NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/0bf453e5 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/0bf453e5 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/0bf453e5 Branch: refs/heads/NUTCH-2292 Commit: 0bf453e5754967541a0798585dbe115630679c5f Parents: 5943d11 Author: Thamme Gowda <[email protected]> Authored: Sat Jul 16 12:47:08 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Sat Jul 16 12:47:08 2016 -0700 ---------------------------------------------------------------------- .gitignore | 6 + bin/crawl | 281 ++++++ bin/nutch | 324 +++++++ nutch-core/pom.xml | 522 +++++++++++ .../nutch/crawl/AbstractFetchSchedule.java | 227 +++++ .../nutch/crawl/AdaptiveFetchSchedule.java | 203 +++++ .../java/org/apache/nutch/crawl/CrawlDatum.java | 572 ++++++++++++ .../java/org/apache/nutch/crawl/CrawlDb.java | 349 ++++++++ .../org/apache/nutch/crawl/CrawlDbFilter.java | 111 +++ .../org/apache/nutch/crawl/CrawlDbMerger.java | 216 +++++ .../org/apache/nutch/crawl/CrawlDbReader.java | 887 +++++++++++++++++++ .../org/apache/nutch/crawl/CrawlDbReducer.java | 339 +++++++ .../apache/nutch/crawl/DeduplicationJob.java | 389 ++++++++ .../nutch/crawl/DefaultFetchSchedule.java | 45 + .../org/apache/nutch/crawl/FetchSchedule.java | 208 +++++ .../nutch/crawl/FetchScheduleFactory.java | 53 ++ .../java/org/apache/nutch/crawl/Generator.java | 859 ++++++++++++++++++ .../java/org/apache/nutch/crawl/Injector.java | 510 +++++++++++ .../java/org/apache/nutch/crawl/Inlink.java | 83 ++ .../java/org/apache/nutch/crawl/Inlinks.java | 110 +++ .../java/org/apache/nutch/crawl/LinkDb.java | 428 +++++++++ .../org/apache/nutch/crawl/LinkDbFilter.java | 128 +++ .../org/apache/nutch/crawl/LinkDbMerger.java | 204 +++++ .../org/apache/nutch/crawl/LinkDbReader.java | 203 +++++ .../org/apache/nutch/crawl/MD5Signature.java | 39 + .../nutch/crawl/MimeAdaptiveFetchSchedule.java | 236 +++++ .../org/apache/nutch/crawl/NutchWritable.java | 66 ++ .../java/org/apache/nutch/crawl/Signature.java | 37 + .../apache/nutch/crawl/SignatureComparator.java | 57 ++ .../apache/nutch/crawl/SignatureFactory.java | 62 ++ .../apache/nutch/crawl/TextMD5Signature.java | 42 + .../nutch/crawl/TextProfileSignature.java | 199 +++++ .../org/apache/nutch/crawl/URLPartitioner.java | 96 ++ .../java/org/apache/nutch/crawl/package.html | 5 + .../org/apache/nutch/fetcher/FetchItem.java | 118 +++ .../apache/nutch/fetcher/FetchItemQueue.java | 139 +++ .../apache/nutch/fetcher/FetchItemQueues.java | 212 +++++ .../org/apache/nutch/fetcher/FetchNode.java | 59 ++ .../org/apache/nutch/fetcher/FetchNodeDb.java | 49 + .../java/org/apache/nutch/fetcher/Fetcher.java | 600 +++++++++++++ .../nutch/fetcher/FetcherOutputFormat.java | 123 +++ .../org/apache/nutch/fetcher/FetcherThread.java | 768 ++++++++++++++++ .../org/apache/nutch/fetcher/QueueFeeder.java | 104 +++ .../java/org/apache/nutch/fetcher/package.html | 5 + .../java/org/apache/nutch/hostdb/HostDatum.java | 324 +++++++ .../org/apache/nutch/hostdb/ReadHostDb.java | 240 +++++ .../org/apache/nutch/hostdb/ResolverThread.java | 121 +++ .../org/apache/nutch/hostdb/UpdateHostDb.java | 259 ++++++ .../apache/nutch/hostdb/UpdateHostDbMapper.java | 239 +++++ .../nutch/hostdb/UpdateHostDbReducer.java | 427 +++++++++ .../org/apache/nutch/indexer/CleaningJob.java | 210 +++++ .../org/apache/nutch/indexer/IndexWriter.java | 47 + .../org/apache/nutch/indexer/IndexWriters.java | 145 +++ .../apache/nutch/indexer/IndexerMapReduce.java | 422 +++++++++ .../nutch/indexer/IndexerOutputFormat.java | 57 ++ .../apache/nutch/indexer/IndexingException.java | 39 + .../apache/nutch/indexer/IndexingFilter.java | 61 ++ .../apache/nutch/indexer/IndexingFilters.java | 60 ++ .../nutch/indexer/IndexingFiltersChecker.java | 371 ++++++++ .../org/apache/nutch/indexer/IndexingJob.java | 358 ++++++++ .../org/apache/nutch/indexer/NutchDocument.java | 144 +++ .../org/apache/nutch/indexer/NutchField.java | 137 +++ .../apache/nutch/indexer/NutchIndexAction.java | 58 ++ .../java/org/apache/nutch/indexer/package.html | 10 + .../apache/nutch/metadata/CreativeCommons.java | 35 + .../org/apache/nutch/metadata/DublinCore.java | 161 ++++ .../java/org/apache/nutch/metadata/Feed.java | 38 + .../org/apache/nutch/metadata/HttpHeaders.java | 51 ++ .../org/apache/nutch/metadata/MetaWrapper.java | 120 +++ .../org/apache/nutch/metadata/Metadata.java | 280 ++++++ .../java/org/apache/nutch/metadata/Nutch.java | 98 ++ .../nutch/metadata/SpellCheckedMetadata.java | 150 ++++ .../java/org/apache/nutch/metadata/package.html | 6 + .../apache/nutch/net/URLExemptionFilter.java | 43 + .../apache/nutch/net/URLExemptionFilters.java | 64 ++ .../java/org/apache/nutch/net/URLFilter.java | 40 + .../org/apache/nutch/net/URLFilterChecker.java | 134 +++ .../apache/nutch/net/URLFilterException.java | 39 + .../java/org/apache/nutch/net/URLFilters.java | 44 + .../org/apache/nutch/net/URLNormalizer.java | 37 + .../apache/nutch/net/URLNormalizerChecker.java | 117 +++ .../org/apache/nutch/net/URLNormalizers.java | 325 +++++++ .../java/org/apache/nutch/net/package-info.java | 23 + .../nutch/net/protocols/HttpDateFormat.java | 124 +++ .../nutch/net/protocols/ProtocolException.java | 47 + .../apache/nutch/net/protocols/Response.java | 46 + .../nutch/net/protocols/package-info.java | 23 + .../org/apache/nutch/parse/HTMLMetaTags.java | 203 +++++ .../org/apache/nutch/parse/HtmlParseFilter.java | 45 + .../apache/nutch/parse/HtmlParseFilters.java | 62 ++ .../java/org/apache/nutch/parse/Outlink.java | 135 +++ .../apache/nutch/parse/OutlinkExtractor.java | 145 +++ .../main/java/org/apache/nutch/parse/Parse.java | 38 + .../org/apache/nutch/parse/ParseCallable.java | 37 + .../java/org/apache/nutch/parse/ParseData.java | 255 ++++++ .../org/apache/nutch/parse/ParseException.java | 39 + .../java/org/apache/nutch/parse/ParseImpl.java | 87 ++ .../apache/nutch/parse/ParseOutputFormat.java | 398 +++++++++ .../org/apache/nutch/parse/ParsePluginList.java | 71 ++ .../apache/nutch/parse/ParsePluginsReader.java | 278 ++++++ .../org/apache/nutch/parse/ParseResult.java | 178 ++++ .../org/apache/nutch/parse/ParseSegment.java | 309 +++++++ .../org/apache/nutch/parse/ParseStatus.java | 311 +++++++ .../java/org/apache/nutch/parse/ParseText.java | 119 +++ .../java/org/apache/nutch/parse/ParseUtil.java | 181 ++++ .../java/org/apache/nutch/parse/Parser.java | 58 ++ .../org/apache/nutch/parse/ParserChecker.java | 270 ++++++ .../org/apache/nutch/parse/ParserFactory.java | 428 +++++++++ .../org/apache/nutch/parse/ParserNotFound.java | 47 + .../org/apache/nutch/parse/package-info.java | 22 + .../plugin/CircularDependencyException.java | 36 + .../java/org/apache/nutch/plugin/Extension.java | 194 ++++ .../org/apache/nutch/plugin/ExtensionPoint.java | 123 +++ .../plugin/MissingDependencyException.java | 36 + .../java/org/apache/nutch/plugin/Pluggable.java | 31 + .../java/org/apache/nutch/plugin/Plugin.java | 95 ++ .../apache/nutch/plugin/PluginClassLoader.java | 80 ++ .../apache/nutch/plugin/PluginDescriptor.java | 363 ++++++++ .../nutch/plugin/PluginManifestParser.java | 303 +++++++ .../apache/nutch/plugin/PluginRepository.java | 523 +++++++++++ .../nutch/plugin/PluginRuntimeException.java | 37 + .../java/org/apache/nutch/plugin/package.html | 40 + .../java/org/apache/nutch/protocol/Content.java | 296 +++++++ .../org/apache/nutch/protocol/Protocol.java | 68 ++ .../nutch/protocol/ProtocolException.java | 39 + .../apache/nutch/protocol/ProtocolFactory.java | 119 +++ .../apache/nutch/protocol/ProtocolNotFound.java | 36 + .../apache/nutch/protocol/ProtocolOutput.java | 55 ++ .../apache/nutch/protocol/ProtocolStatus.java | 297 +++++++ .../apache/nutch/protocol/RobotRulesParser.java | 325 +++++++ .../org/apache/nutch/protocol/package-info.java | 23 + .../nutch/scoring/AbstractScoringFilter.java | 68 ++ .../org/apache/nutch/scoring/ScoringFilter.java | 213 +++++ .../nutch/scoring/ScoringFilterException.java | 43 + .../apache/nutch/scoring/ScoringFilters.java | 118 +++ .../org/apache/nutch/scoring/package-info.java | 22 + .../nutch/scoring/webgraph/LinkDatum.java | 140 +++ .../nutch/scoring/webgraph/LinkDumper.java | 433 +++++++++ .../apache/nutch/scoring/webgraph/LinkRank.java | 677 ++++++++++++++ .../org/apache/nutch/scoring/webgraph/Node.java | 102 +++ .../nutch/scoring/webgraph/NodeDumper.java | 433 +++++++++ .../nutch/scoring/webgraph/NodeReader.java | 136 +++ .../nutch/scoring/webgraph/ScoreUpdater.java | 253 ++++++ .../apache/nutch/scoring/webgraph/WebGraph.java | 783 ++++++++++++++++ .../nutch/scoring/webgraph/package-info.java | 24 + .../nutch/segment/ContentAsTextInputFormat.java | 104 +++ .../apache/nutch/segment/SegmentChecker.java | 136 +++ .../nutch/segment/SegmentMergeFilter.java | 47 + .../nutch/segment/SegmentMergeFilters.java | 84 ++ .../org/apache/nutch/segment/SegmentMerger.java | 793 +++++++++++++++++ .../org/apache/nutch/segment/SegmentPart.java | 113 +++ .../org/apache/nutch/segment/SegmentReader.java | 719 +++++++++++++++ .../org/apache/nutch/segment/package-info.java | 23 + .../org/apache/nutch/service/ConfManager.java | 39 + .../org/apache/nutch/service/JobManager.java | 44 + .../org/apache/nutch/service/NutchReader.java | 37 + .../org/apache/nutch/service/NutchServer.java | 224 +++++ .../nutch/service/impl/ConfManagerImpl.java | 132 +++ .../apache/nutch/service/impl/JobFactory.java | 75 ++ .../nutch/service/impl/JobManagerImpl.java | 95 ++ .../apache/nutch/service/impl/JobWorker.java | 114 +++ .../apache/nutch/service/impl/LinkReader.java | 175 ++++ .../apache/nutch/service/impl/NodeReader.java | 184 ++++ .../service/impl/NutchServerPoolExecutor.java | 131 +++ .../nutch/service/impl/SequenceReader.java | 171 ++++ .../nutch/service/model/request/DbQuery.java | 56 ++ .../nutch/service/model/request/JobConfig.java | 71 ++ .../service/model/request/NutchConfig.java | 51 ++ .../service/model/request/ReaderConfig.java | 30 + .../nutch/service/model/request/SeedList.java | 93 ++ .../nutch/service/model/request/SeedUrl.java | 89 ++ .../service/model/response/FetchNodeDbInfo.java | 103 +++ .../nutch/service/model/response/JobInfo.java | 102 +++ .../service/model/response/NutchServerInfo.java | 55 ++ .../service/resources/AbstractResource.java | 45 + .../nutch/service/resources/AdminResource.java | 85 ++ .../nutch/service/resources/ConfigResource.java | 137 +++ .../nutch/service/resources/DbResource.java | 143 +++ .../nutch/service/resources/JobResource.java | 99 +++ .../nutch/service/resources/ReaderResouce.java | 177 ++++ .../nutch/service/resources/SeedResource.java | 111 +++ .../nutch/tools/AbstractCommonCrawlFormat.java | 393 ++++++++ .../java/org/apache/nutch/tools/Benchmark.java | 284 ++++++ .../apache/nutch/tools/CommonCrawlConfig.java | 147 +++ .../nutch/tools/CommonCrawlDataDumper.java | 716 +++++++++++++++ .../apache/nutch/tools/CommonCrawlFormat.java | 87 ++ .../nutch/tools/CommonCrawlFormatFactory.java | 74 ++ .../nutch/tools/CommonCrawlFormatJackson.java | 109 +++ .../nutch/tools/CommonCrawlFormatJettinson.java | 122 +++ .../nutch/tools/CommonCrawlFormatSimple.java | 174 ++++ .../nutch/tools/CommonCrawlFormatWARC.java | 286 ++++++ .../java/org/apache/nutch/tools/DmozParser.java | 391 ++++++++ .../java/org/apache/nutch/tools/FileDumper.java | 419 +++++++++ .../org/apache/nutch/tools/FreeGenerator.java | 214 +++++ .../org/apache/nutch/tools/ResolveUrls.java | 204 +++++ .../java/org/apache/nutch/tools/WARCUtils.java | 154 ++++ .../apache/nutch/tools/arc/ArcInputFormat.java | 51 ++ .../apache/nutch/tools/arc/ArcRecordReader.java | 299 +++++++ .../nutch/tools/arc/ArcSegmentCreator.java | 426 +++++++++ .../apache/nutch/tools/arc/package-info.java | 23 + .../org/apache/nutch/tools/package-info.java | 22 + .../apache/nutch/tools/warc/WARCExporter.java | 333 +++++++ .../apache/nutch/tools/warc/package-info.java | 23 + .../org/apache/nutch/util/CommandRunner.java | 291 ++++++ .../apache/nutch/util/CrawlCompletionStats.java | 245 +++++ .../org/apache/nutch/util/DeflateUtils.java | 140 +++ .../java/org/apache/nutch/util/DomUtil.java | 104 +++ .../org/apache/nutch/util/DumpFileUtil.java | 147 +++ .../org/apache/nutch/util/EncodingDetector.java | 386 ++++++++ .../java/org/apache/nutch/util/FSUtils.java | 106 +++ .../java/org/apache/nutch/util/GZIPUtils.java | 148 ++++ .../nutch/util/GenericWritableConfigurable.java | 60 ++ .../org/apache/nutch/util/HadoopFSUtil.java | 72 ++ .../java/org/apache/nutch/util/JexlUtil.java | 76 ++ .../java/org/apache/nutch/util/LockUtil.java | 84 ++ .../java/org/apache/nutch/util/MimeUtil.java | 279 ++++++ .../java/org/apache/nutch/util/NodeWalker.java | 129 +++ .../apache/nutch/util/NutchConfiguration.java | 104 +++ .../java/org/apache/nutch/util/NutchJob.java | 30 + .../java/org/apache/nutch/util/NutchTool.java | 109 +++ .../java/org/apache/nutch/util/ObjectCache.java | 56 ++ .../apache/nutch/util/PrefixStringMatcher.java | 119 +++ .../nutch/util/ProtocolStatusStatistics.java | 179 ++++ .../java/org/apache/nutch/util/StringUtil.java | 155 ++++ .../apache/nutch/util/SuffixStringMatcher.java | 114 +++ .../java/org/apache/nutch/util/TableUtil.java | 161 ++++ .../java/org/apache/nutch/util/TimingUtil.java | 72 ++ .../apache/nutch/util/TrieStringMatcher.java | 202 +++++ .../java/org/apache/nutch/util/URLUtil.java | 533 +++++++++++ .../nutch/util/domain/DomainStatistics.java | 234 +++++ .../apache/nutch/util/domain/DomainSuffix.java | 79 ++ .../nutch/util/domain/DomainSuffixes.java | 86 ++ .../nutch/util/domain/DomainSuffixesReader.java | 164 ++++ .../nutch/util/domain/TopLevelDomain.java | 67 ++ .../org/apache/nutch/util/domain/package.html | 14 + .../org/apache/nutch/util/package-info.java | 22 + .../apache/nutch/webui/NutchUiApplication.java | 75 ++ .../nutch/webui/NutchUiApplication.properties | 63 ++ .../org/apache/nutch/webui/NutchUiServer.java | 104 +++ .../apache/nutch/webui/client/NutchClient.java | 49 + .../nutch/webui/client/NutchClientFactory.java | 52 ++ .../nutch/webui/client/impl/CrawlingCycle.java | 82 ++ .../client/impl/CrawlingCycleListener.java | 31 + .../webui/client/impl/NutchClientImpl.java | 99 +++ .../nutch/webui/client/impl/RemoteCommand.java | 76 ++ .../webui/client/impl/RemoteCommandBuilder.java | 64 ++ .../client/impl/RemoteCommandExecutor.java | 110 +++ .../client/impl/RemoteCommandsBatchFactory.java | 97 ++ .../webui/client/model/ConnectionStatus.java | 21 + .../apache/nutch/webui/client/model/Crawl.java | 126 +++ .../nutch/webui/client/model/JobConfig.java | 77 ++ .../nutch/webui/client/model/JobInfo.java | 104 +++ .../nutch/webui/client/model/NutchStatus.java | 62 ++ .../nutch/webui/config/CustomDaoFactory.java | 58 ++ .../nutch/webui/config/CustomTableCreator.java | 83 ++ .../webui/config/NutchGuiConfiguration.java | 33 + .../nutch/webui/config/SpringConfiguration.java | 91 ++ .../apache/nutch/webui/model/NutchConfig.java | 24 + .../apache/nutch/webui/model/NutchInstance.java | 118 +++ .../org/apache/nutch/webui/model/SeedList.java | 106 +++ .../org/apache/nutch/webui/model/SeedUrl.java | 96 ++ .../nutch/webui/pages/AbstractBasePage.html | 33 + .../nutch/webui/pages/AbstractBasePage.java | 206 +++++ .../apache/nutch/webui/pages/DashboardPage.html | 52 ++ .../apache/nutch/webui/pages/DashboardPage.java | 65 ++ .../apache/nutch/webui/pages/LogOutPage.java | 21 + .../nutch/webui/pages/SchedulingPage.java | 21 + .../apache/nutch/webui/pages/SearchPage.java | 21 + .../nutch/webui/pages/StatisticsPage.java | 21 + .../nutch/webui/pages/UrlsUploadPage.java | 21 + .../nutch/webui/pages/UserSettingsPage.java | 21 + .../webui/pages/assets/NutchUiCssReference.java | 39 + .../nutch/webui/pages/assets/nutch-style.css | 149 ++++ .../webui/pages/components/ColorEnumLabel.java | 71 ++ .../pages/components/ColorEnumLabelBuilder.java | 49 + .../pages/components/CpmIteratorAdapter.java | 41 + .../nutch/webui/pages/crawls/CrawlPanel.html | 58 ++ .../nutch/webui/pages/crawls/CrawlPanel.java | 98 ++ .../nutch/webui/pages/crawls/CrawlsPage.html | 90 ++ .../nutch/webui/pages/crawls/CrawlsPage.java | 139 +++ .../webui/pages/instances/InstancePanel.html | 46 + .../webui/pages/instances/InstancePanel.java | 62 ++ .../webui/pages/instances/InstancesPage.html | 66 ++ .../webui/pages/instances/InstancesPage.java | 127 +++ .../nutch/webui/pages/menu/VerticalMenu.html | 48 + .../nutch/webui/pages/menu/VerticalMenu.java | 27 + .../nutch/webui/pages/seed/SeedListsPage.html | 75 ++ .../nutch/webui/pages/seed/SeedListsPage.java | 79 ++ .../apache/nutch/webui/pages/seed/SeedPage.html | 91 ++ .../apache/nutch/webui/pages/seed/SeedPage.java | 153 ++++ .../webui/pages/settings/SettingsPage.html | 43 + .../webui/pages/settings/SettingsPage.java | 59 ++ .../nutch/webui/service/CrawlService.java | 33 + .../webui/service/NutchInstanceService.java | 33 + .../nutch/webui/service/NutchService.java | 31 + .../nutch/webui/service/SeedListService.java | 33 + .../webui/service/impl/CrawlServiceImpl.java | 132 +++ .../service/impl/NutchInstanceServiceImpl.java | 76 ++ .../webui/service/impl/NutchServiceImpl.java | 82 ++ .../webui/service/impl/SeedListServiceImpl.java | 77 ++ nutch-core/src/main/java/overview.html | 9 + .../nutch/crawl/ContinuousCrawlTestUtil.java | 270 ++++++ .../org/apache/nutch/crawl/CrawlDBTestUtil.java | 179 ++++ .../nutch/crawl/CrawlDbUpdateTestDriver.java | 138 +++ .../apache/nutch/crawl/CrawlDbUpdateUtil.java | 166 ++++ .../org/apache/nutch/crawl/DummyWritable.java | 32 + .../nutch/crawl/TODOTestCrawlDbStates.java | 171 ++++ .../nutch/crawl/TestAdaptiveFetchSchedule.java | 121 +++ .../apache/nutch/crawl/TestCrawlDbFilter.java | 148 ++++ .../apache/nutch/crawl/TestCrawlDbMerger.java | 163 ++++ .../apache/nutch/crawl/TestCrawlDbStates.java | 569 ++++++++++++ .../org/apache/nutch/crawl/TestGenerator.java | 373 ++++++++ .../org/apache/nutch/crawl/TestInjector.java | 184 ++++ .../apache/nutch/crawl/TestLinkDbMerger.java | 160 ++++ .../nutch/crawl/TestSignatureFactory.java | 35 + .../org/apache/nutch/fetcher/TestFetcher.java | 210 +++++ .../nutch/indexer/TestIndexerMapReduce.java | 190 ++++ .../nutch/indexer/TestIndexingFilters.java | 113 +++ .../org/apache/nutch/metadata/TestMetadata.java | 281 ++++++ .../metadata/TestSpellCheckedMetadata.java | 303 +++++++ .../org/apache/nutch/net/TestURLFilters.java | 44 + .../apache/nutch/net/TestURLNormalizers.java | 86 ++ .../nutch/parse/TestOutlinkExtractor.java | 99 +++ .../org/apache/nutch/parse/TestParseData.java | 58 ++ .../org/apache/nutch/parse/TestParseText.java | 34 + .../apache/nutch/parse/TestParserFactory.java | 108 +++ .../apache/nutch/parse/parse-plugin-test.xml | 58 ++ .../nutch/plugin/HelloWorldExtension.java | 36 + .../org/apache/nutch/plugin/ITestExtension.java | 27 + .../apache/nutch/plugin/SimpleTestPlugin.java | 57 ++ .../apache/nutch/plugin/TestPluginSystem.java | 305 +++++++ .../org/apache/nutch/protocol/TestContent.java | 94 ++ .../nutch/protocol/TestProtocolFactory.java | 88 ++ .../apache/nutch/segment/TestSegmentMerger.java | 131 +++ .../segment/TestSegmentMergerCrawlDatums.java | 427 +++++++++ .../apache/nutch/service/TestNutchServer.java | 65 ++ .../org/apache/nutch/test/IntegrationTest.java | 6 + .../java/org/apache/nutch/test/TestUtils.java | 29 + .../nutch/tools/TestCommonCrawlDataDumper.java | 126 +++ .../tools/proxy/AbstractTestbedHandler.java | 49 + .../apache/nutch/tools/proxy/DelayHandler.java | 56 ++ .../apache/nutch/tools/proxy/FakeHandler.java | 102 +++ .../nutch/tools/proxy/LogDebugHandler.java | 64 ++ .../nutch/tools/proxy/NotFoundHandler.java | 40 + .../apache/nutch/tools/proxy/ProxyTestbed.java | 156 ++++ .../nutch/tools/proxy/SegmentHandler.java | 255 ++++++ .../apache/nutch/tools/proxy/package-info.java | 22 + .../org/apache/nutch/util/DumpFileUtilTest.java | 68 ++ .../apache/nutch/util/TestEncodingDetector.java | 90 ++ .../org/apache/nutch/util/TestGZIPUtils.java | 241 +++++ .../org/apache/nutch/util/TestMimeUtil.java | 135 +++ .../org/apache/nutch/util/TestNodeWalker.java | 107 +++ .../nutch/util/TestPrefixStringMatcher.java | 115 +++ .../org/apache/nutch/util/TestStringUtil.java | 61 ++ .../nutch/util/TestSuffixStringMatcher.java | 114 +++ .../org/apache/nutch/util/TestTableUtil.java | 75 ++ .../java/org/apache/nutch/util/TestURLUtil.java | 281 ++++++ .../apache/nutch/util/WritableTestUtils.java | 55 ++ nutch-core/src/test/resources/crawl-tests.xml | 62 ++ .../src/test/resources/domain-urlfilter.txt | 22 + .../resources/fetch-test-site/dup_of_pagea.html | 11 + .../resources/fetch-test-site/exception.html | 13 + .../test/resources/fetch-test-site/index.html | 13 + .../fetch-test-site/nested_spider_trap.html | 23 + .../test/resources/fetch-test-site/pagea.html | 11 + .../test/resources/fetch-test-site/pageb.html | 11 + .../test/resources/fetch-test-site/robots.txt | 0 nutch-core/src/test/resources/filter-all.txt | 7 + nutch-core/src/test/resources/log4j.properties | 7 + nutch-core/src/test/resources/nutch-site.xml | 19 + .../src/test/resources/test-mime-util/test.xlsx | Bin 0 -> 3950 bytes .../20150309101625/content/part-00000/.data.crc | Bin 0 -> 124 bytes .../content/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101625/content/part-00000/data | Bin 0 -> 14452 bytes .../20150309101625/content/part-00000/index | Bin 0 -> 217 bytes .../crawl_fetch/part-00000/.data.crc | Bin 0 -> 12 bytes .../crawl_fetch/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101625/crawl_fetch/part-00000/data | Bin 0 -> 293 bytes .../20150309101625/crawl_fetch/part-00000/index | Bin 0 -> 217 bytes .../crawl_generate/.part-00000.crc | Bin 0 -> 12 bytes .../20150309101625/crawl_generate/part-00000 | Bin 0 -> 169 bytes .../20150309101625/crawl_parse/.part-00000.crc | Bin 0 -> 68 bytes .../20150309101625/crawl_parse/part-00000 | Bin 0 -> 7627 bytes .../parse_data/part-00000/.data.crc | Bin 0 -> 24 bytes .../parse_data/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101625/parse_data/part-00000/data | Bin 0 -> 1985 bytes .../20150309101625/parse_data/part-00000/index | Bin 0 -> 217 bytes .../parse_text/part-00000/.data.crc | Bin 0 -> 60 bytes .../parse_text/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101625/parse_text/part-00000/data | Bin 0 -> 6554 bytes .../20150309101625/parse_text/part-00000/index | Bin 0 -> 217 bytes .../20150309101656/content/part-00000/.data.crc | Bin 0 -> 3372 bytes .../content/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101656/content/part-00000/data | Bin 0 -> 430250 bytes .../20150309101656/content/part-00000/index | Bin 0 -> 220 bytes .../crawl_fetch/part-00000/.data.crc | Bin 0 -> 104 bytes .../crawl_fetch/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101656/crawl_fetch/part-00000/data | Bin 0 -> 12121 bytes .../20150309101656/crawl_fetch/part-00000/index | Bin 0 -> 220 bytes .../crawl_generate/.part-00000.crc | Bin 0 -> 52 bytes .../20150309101656/crawl_generate/part-00000 | Bin 0 -> 5590 bytes .../20150309101656/crawl_parse/.part-00000.crc | Bin 0 -> 1652 bytes .../20150309101656/crawl_parse/part-00000 | Bin 0 -> 210047 bytes .../parse_data/part-00000/.data.crc | Bin 0 -> 460 bytes .../parse_data/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101656/parse_data/part-00000/data | Bin 0 -> 57355 bytes .../20150309101656/parse_data/part-00000/index | Bin 0 -> 220 bytes .../parse_text/part-00000/.data.crc | Bin 0 -> 1260 bytes .../parse_text/part-00000/.index.crc | Bin 0 -> 12 bytes .../20150309101656/parse_text/part-00000/data | Bin 0 -> 159920 bytes .../20150309101656/parse_text/part-00000/index | Bin 0 -> 220 bytes nutch-plugins/build-plugin.xml | 255 ++++++ nutch-plugins/build.xml | 213 +++++ nutch-plugins/creativecommons/README.txt | 1 + nutch-plugins/creativecommons/build.xml | 28 + .../creativecommons/conf/crawl-urlfilter.txt | 18 + .../creativecommons/conf/nutch-site.xml | 50 ++ nutch-plugins/creativecommons/ivy.xml | 41 + nutch-plugins/creativecommons/plugin.xml | 48 + nutch-plugins/creativecommons/pom.xml | 38 + .../creativecommons/nutch/CCIndexingFilter.java | 124 +++ .../creativecommons/nutch/CCParseFilter.java | 300 +++++++ .../java/org/creativecommons/nutch/package.html | 5 + .../nutch/TestCCParseFilter.java | 73 ++ .../src/test/resources/anchor.html | 9 + .../creativecommons/src/test/resources/rdf.html | 35 + .../creativecommons/src/test/resources/rel.html | 6 + nutch-plugins/feed/build.xml | 45 + nutch-plugins/feed/ivy.xml | 43 + nutch-plugins/feed/plugin.xml | 49 + nutch-plugins/feed/pom.xml | 45 + .../nutch/indexer/feed/FeedIndexingFilter.java | 129 +++ .../apache/nutch/indexer/feed/package-info.java | 22 + .../org/apache/nutch/parse/feed/FeedParser.java | 374 ++++++++ .../apache/nutch/parse/feed/package-info.java | 22 + .../apache/nutch/parse/feed/TestFeedParser.java | 124 +++ .../feed/src/test/resources/rsstest.rss | 36 + nutch-plugins/headings/build.xml | 22 + nutch-plugins/headings/ivy.xml | 41 + nutch-plugins/headings/plugin.xml | 45 + nutch-plugins/headings/pom.xml | 38 + .../parse/headings/HeadingsParseFilter.java | 124 +++ .../nutch/parse/headings/package-info.java | 22 + nutch-plugins/index-anchor/build.xml | 22 + nutch-plugins/index-anchor/ivy.xml | 41 + nutch-plugins/index-anchor/plugin.xml | 38 + nutch-plugins/index-anchor/pom.xml | 38 + .../indexer/anchor/AnchorIndexingFilter.java | 107 +++ .../apache/nutch/indexer/anchor/package.html | 5 + .../anchor/TestAnchorIndexingFilter.java | 67 ++ nutch-plugins/index-basic/build.xml | 22 + nutch-plugins/index-basic/ivy.xml | 41 + nutch-plugins/index-basic/plugin.xml | 42 + nutch-plugins/index-basic/pom.xml | 38 + .../indexer/basic/BasicIndexingFilter.java | 158 ++++ .../org/apache/nutch/indexer/basic/package.html | 5 + .../indexer/basic/TestBasicIndexingFilter.java | 99 +++ nutch-plugins/index-geoip/build-ivy.xml | 54 ++ nutch-plugins/index-geoip/build.xml | 27 + nutch-plugins/index-geoip/ivy.xml | 46 + nutch-plugins/index-geoip/plugin.xml | 51 ++ nutch-plugins/index-geoip/pom.xml | 55 ++ .../indexer/geoip/GeoIPDocumentCreator.java | 210 +++++ .../indexer/geoip/GeoIPIndexingFilter.java | 241 +++++ .../nutch/indexer/geoip/package-info.java | 28 + nutch-plugins/index-links/build.xml | 22 + nutch-plugins/index-links/ivy.xml | 41 + nutch-plugins/index-links/plugin.xml | 41 + nutch-plugins/index-links/pom.xml | 38 + .../indexer/links/LinksIndexingFilter.java | 167 ++++ .../indexer/links/TestLinksIndexingFilter.java | 218 +++++ .../org/apache/nutch/parse/TestOutlinks.java | 54 ++ nutch-plugins/index-metadata/build.xml | 22 + nutch-plugins/index-metadata/ivy.xml | 41 + nutch-plugins/index-metadata/plugin.xml | 42 + nutch-plugins/index-metadata/pom.xml | 38 + .../nutch/indexer/metadata/MetadataIndexer.java | 104 +++ .../nutch/indexer/metadata/package-info.java | 23 + nutch-plugins/index-more/build.xml | 22 + nutch-plugins/index-more/ivy.xml | 41 + nutch-plugins/index-more/plugin.xml | 42 + nutch-plugins/index-more/pom.xml | 38 + .../nutch/indexer/more/MoreIndexingFilter.java | 344 +++++++ .../org/apache/nutch/indexer/more/package.html | 6 + .../indexer/more/TestMoreIndexingFilter.java | 123 +++ nutch-plugins/index-replace/README.txt | 95 ++ nutch-plugins/index-replace/build.xml | 55 ++ nutch-plugins/index-replace/ivy.xml | 41 + nutch-plugins/index-replace/plugin.xml | 22 + nutch-plugins/index-replace/pom.xml | 50 ++ .../nutch/indexer/replace/FieldReplacer.java | 196 ++++ .../nutch/indexer/replace/ReplaceIndexer.java | 330 +++++++ .../nutch/indexer/replace/package-info.java | 22 + .../nutch/indexer/replace/TestIndexReplace.java | 456 ++++++++++ .../src/test/resources/testIndexReplace.html | 12 + nutch-plugins/index-static/build.xml | 22 + nutch-plugins/index-static/ivy.xml | 41 + nutch-plugins/index-static/plugin.xml | 42 + nutch-plugins/index-static/pom.xml | 38 + .../indexer/staticfield/StaticFieldIndexer.java | 143 +++ .../nutch/indexer/staticfield/package.html | 5 + .../staticfield/TestStaticFieldIndexerTest.java | 194 ++++ nutch-plugins/indexer-cloudsearch/README.md | 58 ++ nutch-plugins/indexer-cloudsearch/build.xml | 22 + .../indexer-cloudsearch/createCSDomain.sh | 22 + nutch-plugins/indexer-cloudsearch/ivy.xml | 41 + nutch-plugins/indexer-cloudsearch/plugin.xml | 50 ++ nutch-plugins/indexer-cloudsearch/pom.xml | 45 + .../cloudsearch/CloudSearchConstants.java | 27 + .../cloudsearch/CloudSearchIndexWriter.java | 382 ++++++++ .../cloudsearch/CloudSearchUtils.java | 73 ++ nutch-plugins/indexer-dummy/build.xml | 22 + nutch-plugins/indexer-dummy/ivy.xml | 41 + nutch-plugins/indexer-dummy/plugin.xml | 38 + nutch-plugins/indexer-dummy/pom.xml | 38 + .../indexwriter/dummy/DummyIndexWriter.java | 103 +++ .../nutch/indexwriter/dummy/package-info.java | 23 + nutch-plugins/indexer-elastic/build-ivy.xml | 54 ++ nutch-plugins/indexer-elastic/build.xml | 22 + .../indexer-elastic/howto_upgrade_es.txt | 6 + nutch-plugins/indexer-elastic/ivy.xml | 43 + nutch-plugins/indexer-elastic/plugin.xml | 71 ++ nutch-plugins/indexer-elastic/pom.xml | 45 + .../indexwriter/elastic/ElasticConstants.java | 28 + .../indexwriter/elastic/ElasticIndexWriter.java | 279 ++++++ .../nutch/indexwriter/elastic/package-info.java | 22 + nutch-plugins/indexer-solr/build-ivy.xml | 54 ++ nutch-plugins/indexer-solr/build.xml | 22 + nutch-plugins/indexer-solr/ivy.xml | 44 + nutch-plugins/indexer-solr/plugin.xml | 48 + nutch-plugins/indexer-solr/pom.xml | 55 ++ .../nutch/indexwriter/solr/SolrConstants.java | 56 ++ .../nutch/indexwriter/solr/SolrIndexWriter.java | 277 ++++++ .../indexwriter/solr/SolrMappingReader.java | 147 +++ .../nutch/indexwriter/solr/SolrUtils.java | 97 ++ .../nutch/indexwriter/solr/package-info.java | 22 + nutch-plugins/language-identifier/build.xml | 38 + nutch-plugins/language-identifier/ivy.xml | 41 + nutch-plugins/language-identifier/plugin.xml | 49 + nutch-plugins/language-identifier/pom.xml | 38 + .../nutch/analysis/lang/HTMLLanguageParser.java | 320 +++++++ .../analysis/lang/LanguageIndexingFilter.java | 89 ++ .../nutch/analysis/lang/langmappings.properties | 188 ++++ .../org/apache/nutch/analysis/lang/package.html | 6 + .../analysis/lang/TestHTMLLanguageParser.java | 149 ++++ .../java/org/apache/nutch/analysis/lang/da.test | 108 +++ .../java/org/apache/nutch/analysis/lang/de.test | 104 +++ .../java/org/apache/nutch/analysis/lang/el.test | 109 +++ .../java/org/apache/nutch/analysis/lang/en.test | 105 +++ .../java/org/apache/nutch/analysis/lang/es.test | 107 +++ .../java/org/apache/nutch/analysis/lang/fi.test | 106 +++ .../java/org/apache/nutch/analysis/lang/fr.test | 105 +++ .../java/org/apache/nutch/analysis/lang/it.test | 109 +++ .../java/org/apache/nutch/analysis/lang/nl.test | 105 +++ .../java/org/apache/nutch/analysis/lang/pt.test | 105 +++ .../java/org/apache/nutch/analysis/lang/sv.test | 108 +++ .../nutch/analysis/lang/test-referencial.txt | 10 + nutch-plugins/lib-htmlunit/build-ivy.xml | 54 ++ nutch-plugins/lib-htmlunit/build.xml | 28 + nutch-plugins/lib-htmlunit/ivy.xml | 52 ++ nutch-plugins/lib-htmlunit/plugin.xml | 166 ++++ nutch-plugins/lib-htmlunit/pom.xml | 55 ++ .../protocol/htmlunit/HtmlUnitWebDriver.java | 189 ++++ .../htmlunit/HtmlUnitWebWindowListener.java | 53 ++ nutch-plugins/lib-http/build.xml | 22 + nutch-plugins/lib-http/ivy.xml | 41 + nutch-plugins/lib-http/plugin.xml | 33 + nutch-plugins/lib-http/pom.xml | 38 + .../protocol/http/api/BlockedException.java | 26 + .../nutch/protocol/http/api/HttpBase.java | 587 ++++++++++++ .../nutch/protocol/http/api/HttpException.java | 40 + .../protocol/http/api/HttpRobotRulesParser.java | 167 ++++ .../apache/nutch/protocol/http/api/package.html | 6 + .../protocol/http/api/TestRobotRulesParser.java | 123 +++ nutch-plugins/lib-nekohtml/build.xml | 30 + nutch-plugins/lib-nekohtml/ivy.xml | 42 + nutch-plugins/lib-nekohtml/plugin.xml | 38 + nutch-plugins/lib-nekohtml/pom.xml | 45 + nutch-plugins/lib-regex-filter/build.xml | 22 + nutch-plugins/lib-regex-filter/ivy.xml | 41 + nutch-plugins/lib-regex-filter/plugin.xml | 33 + nutch-plugins/lib-regex-filter/pom.xml | 54 ++ .../apache/nutch/urlfilter/api/RegexRule.java | 102 +++ .../nutch/urlfilter/api/RegexURLFilterBase.java | 315 +++++++ .../nutch/urlfilter/api/package-info.java | 23 + .../urlfilter/api/RegexURLFilterBaseTest.java | 134 +++ nutch-plugins/lib-selenium/build-ivy.xml | 54 ++ nutch-plugins/lib-selenium/build.xml | 28 + .../lib-selenium/howto_upgrade_selenium.txt | 15 + nutch-plugins/lib-selenium/ivy.xml | 52 ++ nutch-plugins/lib-selenium/plugin.xml | 175 ++++ nutch-plugins/lib-selenium/pom.xml | 49 + .../nutch/protocol/selenium/HttpWebClient.java | 236 +++++ nutch-plugins/lib-xml/build.xml | 36 + nutch-plugins/lib-xml/ivy.xml | 44 + nutch-plugins/lib-xml/plugin.xml | 65 ++ nutch-plugins/lib-xml/pom.xml | 38 + nutch-plugins/microformats-reltag/build.xml | 27 + nutch-plugins/microformats-reltag/ivy.xml | 41 + nutch-plugins/microformats-reltag/plugin.xml | 49 + nutch-plugins/microformats-reltag/pom.xml | 38 + .../reltag/RelTagIndexingFilter.java | 77 ++ .../nutch/microformats/reltag/RelTagParser.java | 148 ++++ .../nutch/microformats/reltag/package.html | 8 + nutch-plugins/mimetype-filter/build.xml | 28 + nutch-plugins/mimetype-filter/ivy.xml | 41 + nutch-plugins/mimetype-filter/plugin.xml | 37 + nutch-plugins/mimetype-filter/pom.xml | 38 + .../indexer/filter/MimeTypeIndexingFilter.java | 273 ++++++ .../filter/MimeTypeIndexingFilterTest.java | 114 +++ .../src/test/resources/allow-images.txt | 34 + .../src/test/resources/block-html.txt | 34 + nutch-plugins/nutch-extensionpoints/build.xml | 30 + nutch-plugins/nutch-extensionpoints/ivy.xml | 41 + nutch-plugins/nutch-extensionpoints/plugin.xml | 67 ++ nutch-plugins/nutch-extensionpoints/pom.xml | 38 + nutch-plugins/parse-ext/build.xml | 32 + nutch-plugins/parse-ext/command | 24 + nutch-plugins/parse-ext/ivy.xml | 41 + nutch-plugins/parse-ext/plugin.xml | 60 ++ nutch-plugins/parse-ext/pom.xml | 38 + .../org/apache/nutch/parse/ext/ExtParser.java | 183 ++++ .../apache/nutch/parse/ext/package-info.java | 22 + .../apache/nutch/parse/ext/TestExtParser.java | 130 +++ nutch-plugins/parse-html/build.xml | 40 + nutch-plugins/parse-html/ivy.xml | 42 + nutch-plugins/parse-html/plugin.xml | 48 + nutch-plugins/parse-html/pom.xml | 49 + .../org/apache/nutch/parse/html/DOMBuilder.java | 766 ++++++++++++++++ .../nutch/parse/html/DOMContentUtils.java | 400 +++++++++ .../nutch/parse/html/HTMLMetaProcessor.java | 214 +++++ .../org/apache/nutch/parse/html/HtmlParser.java | 352 ++++++++ .../parse/html/XMLCharacterRecognizer.java | 112 +++ .../org/apache/nutch/parse/html/package.html | 5 + .../nutch/parse/html/TestDOMContentUtils.java | 347 ++++++++ .../apache/nutch/parse/html/TestHtmlParser.java | 122 +++ .../parse/html/TestRobotsMetaProcessor.java | 155 ++++ nutch-plugins/parse-js/build.xml | 22 + nutch-plugins/parse-js/ivy.xml | 41 + nutch-plugins/parse-js/plugin.xml | 53 ++ nutch-plugins/parse-js/pom.xml | 38 + .../apache/nutch/parse/js/JSParseFilter.java | 301 +++++++ .../org/apache/nutch/parse/js/package-info.java | 23 + nutch-plugins/parse-metatags/README.txt | 17 + nutch-plugins/parse-metatags/build.xml | 37 + nutch-plugins/parse-metatags/ivy.xml | 41 + nutch-plugins/parse-metatags/plugin.xml | 22 + nutch-plugins/parse-metatags/pom.xml | 38 + .../nutch/parse/metatags/MetaTagsParser.java | 124 +++ .../nutch/parse/metatags/package-info.java | 24 + .../nutch/parse/metatags/TestMetatagParser.java | 104 +++ .../src/test/resources/testMetatags.html | 9 + .../test/resources/testMultivalueMetatags.html | 12 + nutch-plugins/parse-replace/README.txt | 91 ++ nutch-plugins/parse-replace/build.xml | 37 + nutch-plugins/parse-replace/ivy.xml | 41 + nutch-plugins/parse-replace/plugin.xml | 22 + nutch-plugins/parse-replace/pom.xml | 38 + .../nutch/parse/replace/ReplaceParser.java | 74 ++ .../nutch/parse/replace/package-info.java | 22 + .../nutch/parse/replace/TestParseReplace.java | 68 ++ .../src/test/resources/testParseReplace.html | 11 + nutch-plugins/parse-swf/build.xml | 38 + nutch-plugins/parse-swf/ivy.xml | 41 + nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt | 33 + nutch-plugins/parse-swf/lib/javaswf.jar | Bin 0 -> 125369 bytes nutch-plugins/parse-swf/plugin.xml | 44 + nutch-plugins/parse-swf/pom.xml | 46 + .../org/apache/nutch/parse/swf/SWFParser.java | 685 ++++++++++++++ .../apache/nutch/parse/swf/package-info.java | 22 + .../apache/nutch/parse/swf/TestSWFParser.java | 94 ++ .../parse-swf/src/test/resources/test1.swf | Bin 0 -> 21054 bytes .../parse-swf/src/test/resources/test1.txt | 60 ++ .../parse-swf/src/test/resources/test2.swf | Bin 0 -> 42534 bytes .../parse-swf/src/test/resources/test2.txt | 5 + .../parse-swf/src/test/resources/test3.swf | Bin 0 -> 51562 bytes .../parse-swf/src/test/resources/test3.txt | 11 + nutch-plugins/parse-tika/build-ivy.xml | 54 ++ nutch-plugins/parse-tika/build.xml | 55 ++ nutch-plugins/parse-tika/howto_upgrade_tika.txt | 8 + nutch-plugins/parse-tika/ivy.xml | 46 + nutch-plugins/parse-tika/plugin.xml | 136 +++ nutch-plugins/parse-tika/pom.xml | 54 ++ .../tika/BoilerpipeExtractorRepository.java | 62 ++ .../org/apache/nutch/parse/tika/DOMBuilder.java | 794 +++++++++++++++++ .../nutch/parse/tika/DOMContentUtils.java | 402 +++++++++ .../nutch/parse/tika/HTMLMetaProcessor.java | 214 +++++ .../org/apache/nutch/parse/tika/TikaParser.java | 286 ++++++ .../parse/tika/XMLCharacterRecognizer.java | 112 +++ .../apache/nutch/parse/tika/package-info.java | 23 + .../apache/nutch/tika/TestDOMContentUtils.java | 337 +++++++ .../org/apache/nutch/tika/TestFeedParser.java | 121 +++ .../apache/nutch/tika/TestImageMetadata.java | 67 ++ .../org/apache/nutch/tika/TestMSWordParser.java | 92 ++ .../org/apache/nutch/tika/TestOOParser.java | 107 +++ .../org/apache/nutch/tika/TestPdfParser.java | 73 ++ .../org/apache/nutch/tika/TestRTFParser.java | 81 ++ .../nutch/tika/TestRobotsMetaProcessor.java | 156 ++++ .../parse-tika/src/test/resources/encrypted.pdf | Bin 0 -> 3431 bytes .../parse-tika/src/test/resources/nutch.html | 519 +++++++++++ .../src/test/resources/nutch_logo_tm.gif | Bin 0 -> 2747 bytes .../parse-tika/src/test/resources/ootest.odt | Bin 0 -> 20753 bytes .../parse-tika/src/test/resources/ootest.sxw | Bin 0 -> 20125 bytes .../parse-tika/src/test/resources/ootest.txt | 30 + .../parse-tika/src/test/resources/pdftest.pdf | 157 ++++ .../parse-tika/src/test/resources/rsstest.rss | 37 + .../parse-tika/src/test/resources/test.rtf | 17 + .../parse-tika/src/test/resources/word97.doc | Bin 0 -> 8192 bytes nutch-plugins/parse-zip/build.xml | 38 + nutch-plugins/parse-zip/ivy.xml | 41 + nutch-plugins/parse-zip/plugin.xml | 46 + nutch-plugins/parse-zip/pom.xml | 38 + .../org/apache/nutch/parse/zip/ZipParser.java | 144 +++ .../nutch/parse/zip/ZipTextExtractor.java | 120 +++ .../apache/nutch/parse/zip/package-info.java | 22 + .../apache/nutch/parse/zip/TestZipParser.java | 71 ++ .../parse-zip/src/test/resources/test.zip | Bin 0 -> 182 bytes .../parsefilter-naivebayes/build-ivy.xml | 54 ++ nutch-plugins/parsefilter-naivebayes/build.xml | 22 + nutch-plugins/parsefilter-naivebayes/ivy.xml | 49 + nutch-plugins/parsefilter-naivebayes/plugin.xml | 56 ++ nutch-plugins/parsefilter-naivebayes/pom.xml | 38 + .../nutch/parsefilter/naivebayes/Classify.java | 120 +++ .../naivebayes/NaiveBayesParseFilter.java | 197 ++++ .../nutch/parsefilter/naivebayes/Train.java | 148 ++++ .../parsefilter/naivebayes/package-info.java | 28 + nutch-plugins/parsefilter-regex/build.xml | 27 + nutch-plugins/parsefilter-regex/ivy.xml | 37 + nutch-plugins/parsefilter-regex/plugin.xml | 42 + nutch-plugins/parsefilter-regex/pom.xml | 38 + .../parsefilter/regex/RegexParseFilter.java | 199 +++++ .../nutch/parsefilter/regex/package-info.java | 23 + .../parsefilter/regex/TestRegexParseFilter.java | 77 ++ .../src/test/resources/regex-parsefilter.txt | 10 + nutch-plugins/plugin.dtd | 206 +++++ nutch-plugins/plugin/pom.xml | 38 + nutch-plugins/pom.xml | 164 ++++ nutch-plugins/protocol-file/build.xml | 29 + nutch-plugins/protocol-file/ivy.xml | 41 + nutch-plugins/protocol-file/plugin.xml | 46 + nutch-plugins/protocol-file/pom.xml | 38 + .../org/apache/nutch/protocol/file/File.java | 228 +++++ .../apache/nutch/protocol/file/FileError.java | 36 + .../nutch/protocol/file/FileException.java | 40 + .../nutch/protocol/file/FileResponse.java | 317 +++++++ .../org/apache/nutch/protocol/file/package.html | 5 + .../nutch/protocol/file/TestProtocolFile.java | 99 +++ .../src/test/resources/testprotocolfile.txt | 1 + .../resources/testprotocolfile_(encoded).txt | 1 + nutch-plugins/protocol-ftp/build.xml | 22 + nutch-plugins/protocol-ftp/ivy.xml | 42 + nutch-plugins/protocol-ftp/plugin.xml | 46 + nutch-plugins/protocol-ftp/pom.xml | 38 + .../org/apache/nutch/protocol/ftp/Client.java | 595 +++++++++++++ .../java/org/apache/nutch/protocol/ftp/Ftp.java | 267 ++++++ .../org/apache/nutch/protocol/ftp/FtpError.java | 36 + .../apache/nutch/protocol/ftp/FtpException.java | 46 + .../ftp/FtpExceptionBadSystResponse.java | 29 + .../FtpExceptionCanNotHaveDataConnection.java | 29 + ...ExceptionControlClosedByForcedDataClose.java | 30 + .../ftp/FtpExceptionUnknownForcedDataClose.java | 30 + .../apache/nutch/protocol/ftp/FtpResponse.java | 521 +++++++++++ .../nutch/protocol/ftp/FtpRobotRulesParser.java | 121 +++ .../protocol/ftp/PrintCommandListener.java | 71 ++ .../org/apache/nutch/protocol/ftp/package.html | 5 + nutch-plugins/protocol-htmlunit/build.xml | 37 + nutch-plugins/protocol-htmlunit/ivy.xml | 38 + nutch-plugins/protocol-htmlunit/plugin.xml | 51 ++ nutch-plugins/protocol-htmlunit/pom.xml | 51 ++ .../apache/nutch/protocol/htmlunit/Http.java | 63 ++ .../nutch/protocol/htmlunit/HttpResponse.java | 573 ++++++++++++ .../apache/nutch/protocol/htmlunit/package.html | 21 + nutch-plugins/protocol-http/build.xml | 50 ++ nutch-plugins/protocol-http/ivy.xml | 41 + nutch-plugins/protocol-http/jsp/basic-http.jsp | 44 + nutch-plugins/protocol-http/jsp/brokenpage.jsp | 47 + nutch-plugins/protocol-http/jsp/redirect301.jsp | 49 + nutch-plugins/protocol-http/jsp/redirect302.jsp | 49 + nutch-plugins/protocol-http/plugin.xml | 51 ++ nutch-plugins/protocol-http/pom.xml | 57 ++ .../org/apache/nutch/protocol/http/Http.java | 73 ++ .../nutch/protocol/http/HttpResponse.java | 558 ++++++++++++ .../org/apache/nutch/protocol/http/package.html | 5 + .../src/test/conf/nutch-site-test.xml | 52 ++ .../nutch/protocol/http/TestProtocolHttp.java | 140 +++ nutch-plugins/protocol-httpclient/build.xml | 45 + nutch-plugins/protocol-httpclient/ivy.xml | 42 + nutch-plugins/protocol-httpclient/jsp/basic.jsp | 74 ++ .../protocol-httpclient/jsp/cookies.jsp | 63 ++ .../protocol-httpclient/jsp/digest.jsp | 68 ++ .../protocol-httpclient/jsp/noauth.jsp | 36 + nutch-plugins/protocol-httpclient/jsp/ntlm.jsp | 89 ++ nutch-plugins/protocol-httpclient/plugin.xml | 58 ++ nutch-plugins/protocol-httpclient/pom.xml | 62 ++ .../DummySSLProtocolSocketFactory.java | 163 ++++ .../httpclient/DummyX509TrustManager.java | 92 ++ .../apache/nutch/protocol/httpclient/Http.java | 572 ++++++++++++ .../protocol/httpclient/HttpAuthentication.java | 45 + .../httpclient/HttpAuthenticationException.java | 71 ++ .../httpclient/HttpAuthenticationFactory.java | 98 ++ .../httpclient/HttpBasicAuthentication.java | 199 +++++ .../httpclient/HttpFormAuthConfigurer.java | 106 +++ .../httpclient/HttpFormAuthentication.java | 223 +++++ .../nutch/protocol/httpclient/HttpResponse.java | 216 +++++ .../nutch/protocol/httpclient/package.html | 9 + .../src/test/conf/httpclient-auth-test.xml | 58 ++ .../src/test/conf/nutch-site-test.xml | 52 ++ .../httpclient/TestProtocolHttpClient.java | 217 +++++ .../protocol-interactiveselenium/README.md | 38 + .../protocol-interactiveselenium/build-ivy.xml | 54 ++ .../protocol-interactiveselenium/build.xml | 37 + .../protocol-interactiveselenium/ivy.xml | 42 + .../protocol-interactiveselenium/plugin.xml | 47 + .../protocol-interactiveselenium/pom.xml | 50 ++ .../protocol/interactiveselenium/Http.java | 59 ++ .../interactiveselenium/HttpResponse.java | 399 +++++++++ .../DefalultMultiInteractionHandler.java | 53 ++ .../DefaultClickAllAjaxLinksHandler.java | 88 ++ .../handlers/DefaultHandler.java | 30 + .../handlers/InteractiveSeleniumHandler.java | 25 + .../protocol/interactiveselenium/package.html | 5 + nutch-plugins/protocol-selenium/README.md | 208 +++++ nutch-plugins/protocol-selenium/build-ivy.xml | 54 ++ nutch-plugins/protocol-selenium/build.xml | 36 + nutch-plugins/protocol-selenium/ivy.xml | 42 + nutch-plugins/protocol-selenium/plugin.xml | 47 + nutch-plugins/protocol-selenium/pom.xml | 50 ++ .../apache/nutch/protocol/selenium/Http.java | 59 ++ .../nutch/protocol/selenium/HttpResponse.java | 360 ++++++++ .../apache/nutch/protocol/selenium/package.html | 5 + nutch-plugins/scoring-depth/build.xml | 6 + nutch-plugins/scoring-depth/ivy.xml | 41 + nutch-plugins/scoring-depth/plugin.xml | 24 + nutch-plugins/scoring-depth/pom.xml | 38 + .../nutch/scoring/depth/DepthScoringFilter.java | 207 +++++ .../nutch/scoring/depth/package-info.java | 23 + nutch-plugins/scoring-link/build.xml | 27 + nutch-plugins/scoring-link/ivy.xml | 41 + nutch-plugins/scoring-link/plugin.xml | 39 + nutch-plugins/scoring-link/pom.xml | 38 + .../scoring/link/LinkAnalysisScoringFilter.java | 95 ++ .../apache/nutch/scoring/link/package-info.java | 23 + nutch-plugins/scoring-opic/build.xml | 27 + nutch-plugins/scoring-opic/ivy.xml | 41 + nutch-plugins/scoring-opic/plugin.xml | 39 + nutch-plugins/scoring-opic/pom.xml | 38 + .../nutch/scoring/opic/OPICScoringFilter.java | 173 ++++ .../apache/nutch/scoring/opic/package-info.java | 23 + nutch-plugins/scoring-similarity/build-ivy.xml | 54 ++ nutch-plugins/scoring-similarity/build.xml | 27 + nutch-plugins/scoring-similarity/ivy.xml | 42 + nutch-plugins/scoring-similarity/plugin.xml | 45 + nutch-plugins/scoring-similarity/pom.xml | 45 + .../scoring/similarity/SimilarityModel.java | 38 + .../similarity/SimilarityScoringFilter.java | 70 ++ .../similarity/cosine/CosineSimilarity.java | 84 ++ .../scoring/similarity/cosine/DocVector.java | 57 ++ .../nutch/scoring/similarity/cosine/Model.java | 190 ++++ .../scoring/similarity/cosine/package-info.java | 7 + .../similarity/util/LuceneAnalyzerUtil.java | 93 ++ .../similarity/util/LuceneTokenizer.java | 166 ++++ .../scoring/similarity/util/package-info.java | 24 + nutch-plugins/subcollection/README.txt | 10 + nutch-plugins/subcollection/build.xml | 22 + nutch-plugins/subcollection/ivy.xml | 41 + nutch-plugins/subcollection/plugin.xml | 41 + nutch-plugins/subcollection/pom.xml | 38 + .../nutch/collection/CollectionManager.java | 240 +++++ .../apache/nutch/collection/Subcollection.java | 259 ++++++ .../org/apache/nutch/collection/package.html | 36 + .../SubcollectionIndexingFilter.java | 101 +++ .../indexer/subcollection/package-info.java | 25 + .../nutch/collection/TestSubcollection.java | 112 +++ nutch-plugins/tld/build.xml | 22 + nutch-plugins/tld/ivy.xml | 41 + nutch-plugins/tld/plugin.xml | 51 ++ nutch-plugins/tld/pom.xml | 38 + .../nutch/indexer/tld/TLDIndexingFilter.java | 69 ++ .../org/apache/nutch/indexer/tld/package.html | 5 + .../nutch/scoring/tld/TLDScoringFilter.java | 114 +++ .../org/apache/nutch/scoring/tld/package.html | 5 + nutch-plugins/urlfilter-automaton/build.xml | 51 ++ nutch-plugins/urlfilter-automaton/ivy.xml | 42 + nutch-plugins/urlfilter-automaton/plugin.xml | 43 + nutch-plugins/urlfilter-automaton/pom.xml | 58 ++ .../urlfilter/automaton/AutomatonURLFilter.java | 116 +++ .../nutch/urlfilter/automaton/package.html | 9 + .../automaton/TestAutomatonURLFilter.java | 56 ++ .../src/test/resources/Benchmarks.rules | 26 + .../src/test/resources/Benchmarks.urls | 297 +++++++ .../src/test/resources/IntranetCrawling.rules | 24 + .../src/test/resources/IntranetCrawling.urls | 8 + .../src/test/resources/WholeWebCrawling.rules | 19 + .../src/test/resources/WholeWebCrawling.urls | 11 + nutch-plugins/urlfilter-domain/build.xml | 28 + nutch-plugins/urlfilter-domain/ivy.xml | 41 + nutch-plugins/urlfilter-domain/plugin.xml | 43 + nutch-plugins/urlfilter-domain/pom.xml | 38 + .../nutch/urlfilter/domain/DomainURLFilter.java | 212 +++++ .../nutch/urlfilter/domain/package-info.java | 25 + .../urlfilter/domain/TestDomainURLFilter.java | 67 ++ .../src/test/resources/hosts.txt | 5 + .../urlfilter-domainblacklist/build.xml | 28 + nutch-plugins/urlfilter-domainblacklist/ivy.xml | 41 + .../urlfilter-domainblacklist/plugin.xml | 43 + nutch-plugins/urlfilter-domainblacklist/pom.xml | 38 + .../DomainBlacklistURLFilter.java | 210 +++++ .../urlfilter/domainblacklist/package-info.java | 24 + .../TestDomainBlacklistURLFilter.java | 49 + .../src/test/resources/hosts.txt | 5 + nutch-plugins/urlfilter-ignoreexempt/README.md | 43 + nutch-plugins/urlfilter-ignoreexempt/build.xml | 55 ++ nutch-plugins/urlfilter-ignoreexempt/ivy.xml | 41 + nutch-plugins/urlfilter-ignoreexempt/plugin.xml | 45 + nutch-plugins/urlfilter-ignoreexempt/pom.xml | 45 + .../ignoreexempt/ExemptionUrlFilter.java | 101 +++ .../urlfilter/ignoreexempt/package-info.java | 24 + nutch-plugins/urlfilter-prefix/build.xml | 22 + nutch-plugins/urlfilter-prefix/ivy.xml | 41 + nutch-plugins/urlfilter-prefix/plugin.xml | 47 + nutch-plugins/urlfilter-prefix/pom.xml | 38 + .../nutch/urlfilter/prefix/PrefixURLFilter.java | 178 ++++ .../apache/nutch/urlfilter/prefix/package.html | 5 + .../urlfilter/prefix/TestPrefixURLFilter.java | 79 ++ nutch-plugins/urlfilter-regex/build.xml | 51 ++ nutch-plugins/urlfilter-regex/ivy.xml | 41 + nutch-plugins/urlfilter-regex/plugin.xml | 48 + nutch-plugins/urlfilter-regex/pom.xml | 53 ++ .../nutch/urlfilter/regex/RegexURLFilter.java | 111 +++ .../apache/nutch/urlfilter/regex/package.html | 5 + .../urlfilter/regex/TestRegexURLFilter.java | 61 ++ .../src/test/resources/Benchmarks.rules | 26 + .../src/test/resources/Benchmarks.urls | 297 +++++++ .../src/test/resources/IntranetCrawling.rules | 27 + .../src/test/resources/IntranetCrawling.urls | 8 + .../src/test/resources/WholeWebCrawling.rules | 22 + .../src/test/resources/WholeWebCrawling.urls | 11 + .../src/test/resources/nutch1838.rules | 12 + .../src/test/resources/nutch1838.urls | 3 + nutch-plugins/urlfilter-suffix/build.xml | 22 + nutch-plugins/urlfilter-suffix/ivy.xml | 41 + nutch-plugins/urlfilter-suffix/plugin.xml | 47 + nutch-plugins/urlfilter-suffix/pom.xml | 38 + .../nutch/urlfilter/suffix/SuffixURLFilter.java | 331 +++++++ .../nutch/urlfilter/suffix/package-info.java | 23 + .../urlfilter/suffix/TestSuffixURLFilter.java | 123 +++ nutch-plugins/urlfilter-validator/build.xml | 22 + nutch-plugins/urlfilter-validator/ivy.xml | 41 + nutch-plugins/urlfilter-validator/plugin.xml | 41 + nutch-plugins/urlfilter-validator/pom.xml | 38 + .../nutch/urlfilter/validator/UrlValidator.java | 386 ++++++++ .../nutch/urlfilter/validator/package.html | 9 + .../urlfilter/validator/TestUrlValidator.java | 79 ++ nutch-plugins/urlmeta/build.xml | 22 + nutch-plugins/urlmeta/ivy.xml | 41 + nutch-plugins/urlmeta/plugin.xml | 47 + nutch-plugins/urlmeta/pom.xml | 38 + .../indexer/urlmeta/URLMetaIndexingFilter.java | 118 +++ .../apache/nutch/indexer/urlmeta/package.html | 12 + .../scoring/urlmeta/URLMetaScoringFilter.java | 175 ++++ .../apache/nutch/scoring/urlmeta/package.html | 11 + nutch-plugins/urlnormalizer-ajax/build.xml | 22 + nutch-plugins/urlnormalizer-ajax/ivy.xml | 41 + nutch-plugins/urlnormalizer-ajax/plugin.xml | 41 + nutch-plugins/urlnormalizer-ajax/pom.xml | 38 + .../urlnormalizer/ajax/AjaxURLNormalizer.java | 236 +++++ .../ajax/TestAjaxURLNormalizer.java | 67 ++ nutch-plugins/urlnormalizer-basic/build.xml | 22 + nutch-plugins/urlnormalizer-basic/ivy.xml | 41 + nutch-plugins/urlnormalizer-basic/plugin.xml | 41 + nutch-plugins/urlnormalizer-basic/pom.xml | 38 + .../urlnormalizer/basic/BasicURLNormalizer.java | 290 ++++++ .../net/urlnormalizer/basic/package-info.java | 23 + .../basic/TestBasicURLNormalizer.java | 175 ++++ nutch-plugins/urlnormalizer-host/build.xml | 27 + nutch-plugins/urlnormalizer-host/ivy.xml | 41 + nutch-plugins/urlnormalizer-host/plugin.xml | 43 + nutch-plugins/urlnormalizer-host/pom.xml | 38 + .../urlnormalizer/host/HostURLNormalizer.java | 198 +++++ .../net/urlnormalizer/host/package-info.java | 23 + .../host/TestHostURLNormalizer.java | 57 ++ .../src/test/resources/hosts.txt | 8 + nutch-plugins/urlnormalizer-pass/build.xml | 22 + nutch-plugins/urlnormalizer-pass/ivy.xml | 41 + nutch-plugins/urlnormalizer-pass/plugin.xml | 41 + nutch-plugins/urlnormalizer-pass/pom.xml | 38 + .../urlnormalizer/pass/PassURLNormalizer.java | 49 + .../net/urlnormalizer/pass/package-info.java | 23 + .../pass/TestPassURLNormalizer.java | 45 + nutch-plugins/urlnormalizer-protocol/build.xml | 27 + nutch-plugins/urlnormalizer-protocol/ivy.xml | 41 + nutch-plugins/urlnormalizer-protocol/plugin.xml | 43 + nutch-plugins/urlnormalizer-protocol/pom.xml | 38 + .../protocol/ProtocolURLNormalizer.java | 190 ++++ .../protocol/TestProtocolURLNormalizer.java | 55 ++ .../src/test/resources/protocols.txt | 7 + .../urlnormalizer-querystring/build.xml | 22 + nutch-plugins/urlnormalizer-querystring/ivy.xml | 41 + .../urlnormalizer-querystring/plugin.xml | 42 + nutch-plugins/urlnormalizer-querystring/pom.xml | 38 + .../querystring/QuerystringURLNormalizer.java | 91 ++ .../urlnormalizer/querystring/package-info.java | 23 + .../TestQuerystringURLNormalizer.java | 49 + nutch-plugins/urlnormalizer-regex/build.xml | 34 + nutch-plugins/urlnormalizer-regex/ivy.xml | 41 + nutch-plugins/urlnormalizer-regex/plugin.xml | 41 + nutch-plugins/urlnormalizer-regex/pom.xml | 38 + .../urlnormalizer/regex/RegexURLNormalizer.java | 324 +++++++ .../net/urlnormalizer/regex/package-info.java | 23 + .../regex/TestRegexURLNormalizer.java | 186 ++++ .../test/resources/regex-normalize-default.test | 84 ++ .../test/resources/regex-normalize-default.xml | 66 ++ .../test/resources/regex-normalize-scope1.test | 8 + .../test/resources/regex-normalize-scope1.xml | 21 + nutch-plugins/urlnormalizer-slash/build.xml | 27 + nutch-plugins/urlnormalizer-slash/ivy.xml | 41 + nutch-plugins/urlnormalizer-slash/plugin.xml | 43 + nutch-plugins/urlnormalizer-slash/pom.xml | 38 + .../urlnormalizer/slash/SlashURLNormalizer.java | 224 +++++ .../slash/TestSlashURLNormalizer.java | 73 ++ .../src/test/resources/slashes.txt | 7 + pom.xml | 157 ++++ src/bin/crawl | 281 ------ src/bin/nutch | 324 ------- .../nutch/crawl/AbstractFetchSchedule.java | 227 ----- .../nutch/crawl/AdaptiveFetchSchedule.java | 203 ----- src/java/org/apache/nutch/crawl/CrawlDatum.java | 572 ------------ src/java/org/apache/nutch/crawl/CrawlDb.java | 349 -------- .../org/apache/nutch/crawl/CrawlDbFilter.java | 111 --- .../org/apache/nutch/crawl/CrawlDbMerger.java | 216 ----- .../org/apache/nutch/crawl/CrawlDbReader.java | 887 ------------------- .../org/apache/nutch/crawl/CrawlDbReducer.java | 339 ------- .../apache/nutch/crawl/DeduplicationJob.java | 389 -------- .../nutch/crawl/DefaultFetchSchedule.java | 45 - .../org/apache/nutch/crawl/FetchSchedule.java | 208 ----- .../nutch/crawl/FetchScheduleFactory.java | 53 -- src/java/org/apache/nutch/crawl/Generator.java | 859 ------------------ src/java/org/apache/nutch/crawl/Injector.java | 510 ----------- src/java/org/apache/nutch/crawl/Inlink.java | 83 -- src/java/org/apache/nutch/crawl/Inlinks.java | 110 --- src/java/org/apache/nutch/crawl/LinkDb.java | 428 --------- .../org/apache/nutch/crawl/LinkDbFilter.java | 128 --- .../org/apache/nutch/crawl/LinkDbMerger.java | 204 ----- .../org/apache/nutch/crawl/LinkDbReader.java | 203 ----- .../org/apache/nutch/crawl/MD5Signature.java | 39 - .../nutch/crawl/MimeAdaptiveFetchSchedule.java | 236 ----- .../org/apache/nutch/crawl/NutchWritable.java | 66 -- src/java/org/apache/nutch/crawl/Signature.java | 37 - .../apache/nutch/crawl/SignatureComparator.java | 57 -- .../apache/nutch/crawl/SignatureFactory.java | 62 -- .../apache/nutch/crawl/TextMD5Signature.java | 42 - .../nutch/crawl/TextProfileSignature.java | 199 ----- .../org/apache/nutch/crawl/URLPartitioner.java | 96 -- src/java/org/apache/nutch/crawl/package.html | 5 - .../org/apache/nutch/fetcher/FetchItem.java | 118 --- .../apache/nutch/fetcher/FetchItemQueue.java | 139 --- .../apache/nutch/fetcher/FetchItemQueues.java | 212 ----- .../org/apache/nutch/fetcher/FetchNode.java | 59 -- .../org/apache/nutch/fetcher/FetchNodeDb.java | 49 - src/java/org/apache/nutch/fetcher/Fetcher.java | 600 ------------- .../nutch/fetcher/FetcherOutputFormat.java | 123 --- .../org/apache/nutch/fetcher/FetcherThread.java | 768 ---------------- .../org/apache/nutch/fetcher/QueueFeeder.java | 104 --- src/java/org/apache/nutch/fetcher/package.html | 5 - src/java/org/apache/nutch/hostdb/HostDatum.java | 324 ------- .../org/apache/nutch/hostdb/ReadHostDb.java | 240 ----- .../org/apache/nutch/hostdb/ResolverThread.java | 121 --- .../org/apache/nutch/hostdb/UpdateHostDb.java | 259 ------ .../apache/nutch/hostdb/UpdateHostDbMapper.java | 239 ----- .../nutch/hostdb/UpdateHostDbReducer.java | 427 --------- .../org/apache/nutch/indexer/CleaningJob.java | 210 ----- .../org/apache/nutch/indexer/IndexWriter.java | 47 - .../org/apache/nutch/indexer/IndexWriters.java | 145 --- .../apache/nutch/indexer/IndexerMapReduce.java | 422 --------- .../nutch/indexer/IndexerOutputFormat.java | 57 -- .../apache/nutch/indexer/IndexingException.java | 39 - .../apache/nutch/indexer/IndexingFilter.java | 61 -- .../apache/nutch/indexer/IndexingFilters.java | 60 -- .../nutch/indexer/IndexingFiltersChecker.java | 371 -------- .../org/apache/nutch/indexer/IndexingJob.java | 358 -------- .../org/apache/nutch/indexer/NutchDocument.java | 144 --- .../org/apache/nutch/indexer/NutchField.java | 137 --- .../apache/nutch/indexer/NutchIndexAction.java | 58 -- src/java/org/apache/nutch/indexer/package.html | 10 - .../apache/nutch/metadata/CreativeCommons.java | 35 - .../org/apache/nutch/metadata/DublinCore.java | 161 ---- src/java/org/apache/nutch/metadata/Feed.java | 38 - .../org/apache/nutch/metadata/HttpHeaders.java | 51 -- .../org/apache/nutch/metadata/MetaWrapper.java | 120 --- .../org/apache/nutch/metadata/Metadata.java | 280 ------ src/java/org/apache/nutch/metadata/Nutch.java | 98 -- .../nutch/metadata/SpellCheckedMetadata.java | 150 ---- src/java/org/apache/nutch/metadata/package.html | 6 - .../apache/nutch/net/URLExemptionFilter.java | 43 - .../apache/nutch/net/URLExemptionFilters.java | 64 -- src/java/org/apache/nutch/net/URLFilter.java | 40 - .../org/apache/nutch/net/URLFilterChecker.java | 134 --- .../apache/nutch/net/URLFilterException.java | 39 - src/java/org/apache/nutch/net/URLFilters.java | 44 - .../org/apache/nutch/net/URLNormalizer.java | 37 - .../apache/nutch/net/URLNormalizerChecker.java | 117 --- .../org/apache/nutch/net/URLNormalizers.java | 325 ------- src/java/org/apache/nutch/net/package-info.java | 23 - .../nutch/net/protocols/HttpDateFormat.java | 124 --- .../nutch/net/protocols/ProtocolException.java | 47 - .../apache/nutch/net/protocols/Response.java | 46 - .../nutch/net/protocols/package-info.java | 23 - .../org/apache/nutch/parse/HTMLMetaTags.java | 203 ----- .../org/apache/nutch/parse/HtmlParseFilter.java | 45 - .../apache/nutch/parse/HtmlParseFilters.java | 62 -- src/java/org/apache/nutch/parse/Outlink.java | 135 --- .../apache/nutch/parse/OutlinkExtractor.java | 145 --- src/java/org/apache/nutch/parse/Parse.java | 38 - .../org/apache/nutch/parse/ParseCallable.java | 37 - src/java/org/apache/nutch/parse/ParseData.java | 255 ------ .../org/apache/nutch/parse/ParseException.java | 39 - src/java/org/apache/nutch/parse/ParseImpl.java | 87 -- .../apache/nutch/parse/ParseOutputFormat.java | 398 --------- .../org/apache/nutch/parse/ParsePluginList.java | 71 -- .../apache/nutch/parse/ParsePluginsReader.java | 278 ------ .../org/apache/nutch/parse/ParseResult.java | 178 ---- .../org/apache/nutch/parse/ParseSegment.java | 309 ------- .../org/apache/nutch/parse/ParseStatus.java | 311 ------- src/java/org/apache/nutch/parse/ParseText.java | 119 --- src/java/org/apache/nutch/parse/ParseUtil.java | 181 ---- src/java/org/apache/nutch/parse/Parser.java | 58 -- .../org/apache/nutch/parse/ParserChecker.java | 270 ------ .../org/apache/nutch/parse/ParserFactory.java | 428 --------- .../org/apache/nutch/parse/ParserNotFound.java | 47 - .../org/apache/nutch/parse/package-info.java | 22 - .../plugin/CircularDependencyException.java | 36 - src/java/org/apache/nutch/plugin/Extension.java | 194 ---- .../org/apache/nutch/plugin/ExtensionPoint.java | 123 --- .../plugin/MissingDependencyException.java | 36 - src/java/org/apache/nutch/plugin/Pluggable.java | 31 - src/java/org/apache/nutch/plugin/Plugin.java | 95 -- .../apache/nutch/plugin/PluginClassLoader.java | 80 -- .../apache/nutch/plugin/PluginDescriptor.java | 363 -------- .../nutch/plugin/PluginManifestParser.java | 303 ------- .../apache/nutch/plugin/PluginRepository.java | 523 ----------- .../nutch/plugin/PluginRuntimeException.java | 37 - src/java/org/apache/nutch/plugin/package.html | 40 - src/java/org/apache/nutch/protocol/Content.java | 296 ------- .../org/apache/nutch/protocol/Protocol.java | 68 -- .../nutch/protocol/ProtocolException.java | 39 - .../apache/nutch/protocol/ProtocolFactory.java | 119 --- .../apache/nutch/protocol/ProtocolNotFound.java | 36 - .../apache/nutch/protocol/ProtocolOutput.java | 55 -- .../apache/nutch/protocol/ProtocolStatus.java | 297 ------- .../apache/nutch/protocol/RobotRulesParser.java | 325 ------- .../org/apache/nutch/protocol/package-info.java | 23 - .../nutch/scoring/AbstractScoringFilter.java | 68 -- .../org/apache/nutch/scoring/ScoringFilter.java | 213 ----- .../nutch/scoring/ScoringFilterException.java | 43 - .../apache/nutch/scoring/ScoringFilters.java | 118 --- .../org/apache/nutch/scoring/package-info.java | 22 - .../nutch/scoring/webgraph/LinkDatum.java | 140 --- .../nutch/scoring/webgraph/LinkDumper.java | 433 --------- .../apache/nutch/scoring/webgraph/LinkRank.java | 677 -------------- .../org/apache/nutch/scoring/webgraph/Node.java | 102 --- .../nutch/scoring/webgraph/NodeDumper.java | 433 --------- .../nutch/scoring/webgraph/NodeReader.java | 136 --- .../nutch/scoring/webgraph/ScoreUpdater.java | 253 ------ .../apache/nutch/scoring/webgraph/WebGraph.java | 783 ---------------- .../nutch/scoring/webgraph/package-info.java | 24 - .../nutch/segment/ContentAsTextInputFormat.java | 104 --- .../apache/nutch/segment/SegmentChecker.java | 136 --- .../nutch/segment/SegmentMergeFilter.java | 47 - .../nutch/segment/SegmentMergeFilters.java | 84 -- .../org/apache/nutch/segment/SegmentMerger.java | 793 ----------------- .../org/apache/nutch/segment/SegmentPart.java | 113 --- .../org/apache/nutch/segment/SegmentReader.java | 719 --------------- .../org/apache/nutch/segment/package-info.java | 23 - .../org/apache/nutch/service/ConfManager.java | 39 - .../org/apache/nutch/service/JobManager.java | 44 - .../org/apache/nutch/service/NutchReader.java | 37 - .../org/apache/nutch/service/NutchServer.java | 224 ----- .../nutch/service/impl/ConfManagerImpl.java | 132 --- .../apache/nutch/service/impl/JobFactory.java | 75 -- .../nutch/service/impl/JobManagerImpl.java | 95 -- .../apache/nutch/service/impl/JobWorker.java | 114 --- .../apache/nutch/service/impl/LinkReader.java | 175 ---- .../apache/nutch/service/impl/NodeReader.java | 184 ---- .../service/impl/NutchServerPoolExecutor.java | 131 --- .../nutch/service/impl/SequenceReader.java | 171 ---- .../nutch/service/model/request/DbQuery.java | 56 -- .../nutch/service/model/request/JobConfig.java | 71 -- .../service/model/request/NutchConfig.java | 51 -- .../service/model/request/ReaderConfig.java | 30 - .../nutch/service/model/request/SeedList.java | 93 -- .../nutch/service/model/request/SeedUrl.java | 89 -- .../service/model/response/FetchNodeDbInfo.java | 103 --- .../nutch/service/model/response/JobInfo.java | 102 --- .../service/model/response/NutchServerInfo.java | 55 -- .../service/resources/AbstractResource.java | 45 - .../nutch/service/resources/AdminResource.java | 85 -- .../nutch/service/resources/ConfigResource.java | 137 --- .../nutch/service/resources/DbResource.java | 143 --- .../nutch/service/resources/JobResource.java | 99 --- .../nutch/service/resources/ReaderResouce.java | 177 ---- .../nutch/service/resources/SeedResource.java | 111 --- .../nutch/tools/AbstractCommonCrawlFormat.java | 393 -------- src/java/org/apache/nutch/tools/Benchmark.java | 284 ------ .../apache/nutch/tools/CommonCrawlConfig.java | 147 --- .../nutch/tools/CommonCrawlDataDumper.java | 716 --------------- .../apache/nutch/tools/CommonCrawlFormat.java | 87 -- .../nutch/tools/CommonCrawlFormatFactory.java | 74 -- .../nutch/tools/CommonCrawlFormatJackson.java | 109 --- .../nutch/tools/CommonCrawlFormatJettinson.java | 122 --- .../nutch/tools/CommonCrawlFormatSimple.java | 174 ---- .../nutch/tools/CommonCrawlFormatWARC.java | 286 ------ src/java/org/apache/nutch/tools/DmozParser.java | 391 -------- src/java/org/apache/nutch/tools/FileDumper.java | 419 --------- .../org/apache/nutch/tools/FreeGenerator.java | 214 ----- .../org/apache/nutch/tools/ResolveUrls.java | 204 ----- src/java/org/apache/nutch/tools/WARCUtils.java | 154 ---- .../apache/nutch/tools/arc/ArcInputFormat.java | 51 -- .../apache/nutch/tools/arc/ArcRecordReader.java | 299 ------- .../nutch/tools/arc/ArcSegmentCreator.java | 426 --------- .../apache/nutch/tools/arc/package-info.java | 23 - .../org/apache/nutch/tools/package-info.java | 22 - .../apache/nutch/tools/warc/WARCExporter.java | 333 ------- .../apache/nutch/tools/warc/package-info.java | 23 - .../org/apache/nutch/util/CommandRunner.java | 291 ------ .../apache/nutch/util/CrawlCompletionStats.java | 245 ----- .../org/apache/nutch/util/DeflateUtils.java | 140 --- src/java/org/apache/nutch/util/DomUtil.java | 104 --- .../org/apache/nutch/util/DumpFileUtil.java | 147 --- .../org/apache/nutch/util/EncodingDetector.java | 386 -------- src/java/org/apache/nutch/util/FSUtils.java | 106 --- src/java/org/apache/nutch/util/GZIPUtils.java | 148 ---- .../nutch/util/GenericWritableConfigurable.java | 60 -- .../org/apache/nutch/util/HadoopFSUtil.java | 72 -- src/java/org/apache/nutch/util/JexlUtil.java | 76 -- src/java/org/apache/nutch/util/LockUtil.java | 84 -- src/java/org/apache/nutch/util/MimeUtil.java | 279 ------ src/java/org/apache/nutch/util/NodeWalker.java | 129 --- .../apache/nutch/util/NutchConfiguration.java | 104 --- src/java/org/apache/nutch/util/NutchJob.java | 30 - src/java/org/apache/nutch/util/NutchTool.java | 109 --- src/java/org/apache/nutch/util/ObjectCache.java | 56 -- .../apache/nutch/util/PrefixStringMatcher.java | 119 --- .../nutch/util/ProtocolStatusStatistics.java | 179 ---- src/java/org/apache/nutch/util/StringUtil.java | 155 ---- .../apache/nutch/util/SuffixStringMatcher.java | 114 --- src/java/org/apache/nutch/util/TableUtil.java | 161 ---- src/java/org/apache/nutch/util/TimingUtil.java | 72 -- .../apache/nutch/util/TrieStringMatcher.java | 202 ----- src/java/org/apache/nutch/util/URLUtil.java | 533 ----------- .../nutch/util/domain/DomainStatistics.java | 234 ----- .../apache/nutch/util/domain/DomainSuffix.java | 79 -- .../nutch/util/domain/DomainSuffixes.java | 86 -- .../nutch/util/domain/DomainSuffixesReader.java | 164 ---- .../nutch/util/domain/TopLevelDomain.java | 67 -- .../org/apache/nutch/util/domain/package.html | 14 - .../org/apache/nutch/util/package-info.java | 22 - .../apache/nutch/webui/NutchUiApplication.java | 75 -- .../nutch/webui/NutchUiApplication.properties | 63 -- .../org/apache/nutch/webui/NutchUiServer.java | 104 --- .../apache/nutch/webui/client/NutchClient.java | 49 - .../nutch/webui/client/NutchClientFactory.java | 52 -- .../nutch/webui/client/impl/CrawlingCycle.java | 82 -- .../client/impl/CrawlingCycleListener.java | 31 - .../webui/client/impl/NutchClientImpl.java | 99 --- .../nutch/webui/client/impl/RemoteCommand.java | 76 -- .../webui/client/impl/RemoteCommandBuilder.java | 64 -- .../client/impl/RemoteCommandExecutor.java | 110 --- .../client/impl/RemoteCommandsBatchFactory.java | 97 -- .../webui/client/model/ConnectionStatus.java | 21 - .../apache/nutch/webui/client/model/Crawl.java | 126 --- .../nutch/webui/client/model/JobConfig.java | 77 -- .../nutch/webui/client/model/JobInfo.java | 104 --- .../nutch/webui/client/model/NutchStatus.java | 62 -- .../nutch/webui/config/CustomDaoFactory.java | 58 -- .../nutch/webui/config/CustomTableCreator.java | 83 -- .../webui/config/NutchGuiConfiguration.java | 33 - .../nutch/webui/config/SpringConfiguration.java | 91 -- .../apache/nutch/webui/model/NutchConfig.java | 24 - .../apache/nutch/webui/model/NutchInstance.java | 118 --- .../org/apache/nutch/webui/model/SeedList.java | 106 --- .../org/apache/nutch/webui/model/SeedUrl.java | 96 -- .../nutch/webui/pages/AbstractBasePage.html | 33 - .../nutch/webui/pages/AbstractBasePage.java | 206 ----- .../apache/nutch/webui/pages/DashboardPage.html | 52 -- .../apache/nutch/webui/pages/DashboardPage.java | 65 -- .../apache/nutch/webui/pages/LogOutPage.java | 21 - .../nutch/webui/pages/SchedulingPage.java | 21 - .../apache/nutch/webui/pages/SearchPage.java | 21 - .../nutch/webui/pages/StatisticsPage.java | 21 - .../nutch/webui/pages/UrlsUploadPage.java | 21 - .../nutch/webui/pages/UserSettingsPage.java | 21 - .../webui/pages/assets/NutchUiCssReference.java | 39 - .../nutch/webui/pages/assets/nutch-style.css | 149 ---- .../webui/pages/components/ColorEnumLabel.java | 71 -- .../pages/components/ColorEnumLabelBuilder.java | 49 - .../pages/components/CpmIteratorAdapter.java | 41 - .../nutch/webui/pages/crawls/CrawlPanel.html | 58 -- .../nutch/webui/pages/crawls/CrawlPanel.java | 98 -- .../nutch/webui/pages/crawls/CrawlsPage.html | 90 -- .../nutch/webui/pages/crawls/CrawlsPage.java | 139 --- .../webui/pages/instances/InstancePanel.html | 46 - .../webui/pages/instances/InstancePanel.java | 62 -- .../webui/pages/instances/InstancesPage.html | 66 -- .../webui/pages/instances/InstancesPage.java | 127 --- .../nutch/webui/pages/menu/VerticalMenu.html | 48 - .../nutch/webui/pages/menu/VerticalMenu.java | 27 - .../nutch/webui/pages/seed/SeedListsPage.html | 75 -- .../nutch/webui/pages/seed/SeedListsPage.java | 79 -- .../apache/nutch/webui/pages/seed/SeedPage.html | 91 -- .../apache/nutch/webui/pages/seed/SeedPage.java | 153 ---- .../webui/pages/settings/SettingsPage.html | 43 - .../webui/pages/settings/SettingsPage.java | 59 -- .../nutch/webui/service/CrawlService.java | 33 - .../webui/service/NutchInstanceService.java | 33 - .../nutch/webui/service/NutchService.java | 31 - .../nutch/webui/service/SeedListService.java | 33 - .../webui/service/impl/CrawlServiceImpl.java | 132 --- .../service/impl/NutchInstanceServiceImpl.java | 76 -- .../webui/service/impl/NutchServiceImpl.java | 82 -- .../webui/service/impl/SeedListServiceImpl.java | 77 -- src/java/overview.html | 9 - src/plugin/build-plugin.xml | 255 ------ src/plugin/build.xml | 213 ----- src/plugin/creativecommons/README.txt | 1 - src/plugin/creativecommons/build.xml | 28 - .../creativecommons/conf/crawl-urlfilter.txt | 18 - src/plugin/creativecommons/conf/nutch-site.xml | 50 -- src/plugin/creativecommons/data/anchor.html | 9 - src/plugin/creativecommons/data/rdf.html | 35 - src/plugin/creativecommons/data/rel.html | 6 - src/plugin/creativecommons/ivy.xml | 41 - src/plugin/creativecommons/plugin.xml | 48 - .../creativecommons/nutch/CCIndexingFilter.java | 124 --- .../creativecommons/nutch/CCParseFilter.java | 300 ------- .../java/org/creativecommons/nutch/package.html | 5 - .../nutch/TestCCParseFilter.java | 73 -- src/plugin/feed/build.xml | 45 - src/plugin/feed/ivy.xml | 43 - src/plugin/feed/plugin.xml | 49 - src/plugin/feed/sample/rsstest.rss | 36 - .../nutch/indexer/feed/FeedIndexingFilter.java | 129 --- .../apache/nutch/indexer/feed/package-info.java | 22 - .../org/apache/nutch/parse/feed/FeedParser.java | 374 -------- .../apache/nutch/parse/feed/package-info.java | 22 - .../apache/nutch/parse/feed/TestFeedParser.java | 124 --- src/plugin/headings/build.xml | 22 - src/plugin/headings/ivy.xml | 41 - src/plugin/headings/plugin.xml | 45 - .../parse/headings/HeadingsParseFilter.java | 124 --- .../nutch/parse/headings/package-info.java | 22 - src/plugin/index-anchor/build.xml | 22 - src/plugin/index-anchor/ivy.xml | 41 - src/plugin/index-anchor/plugin.xml | 38 - .../indexer/anchor/AnchorIndexingFilter.java | 107 --- .../apache/nutch/indexer/anchor/package.html | 5 - .../anchor/TestAnchorIndexingFilter.java | 67 -- src/plugin/index-basic/build.xml | 22 - src/plugin/index-basic/ivy.xml | 41 - src/plugin/index-basic/plugin.xml | 42 - .../indexer/basic/BasicIndexingFilter.java | 158 ---- .../org/apache/nutch/indexer/basic/package.html | 5 - .../indexer/basic/TestBasicIndexingFilter.java | 99 --- src/plugin/index-geoip/build-ivy.xml | 54 -- src/plugin/index-geoip/build.xml | 27 - src/plugin/index-geoip/ivy.xml | 46 - src/plugin/index-geoip/plugin.xml | 51 -- .../indexer/geoip/GeoIPDocumentCreator.java | 210 ----- .../indexer/geoip/GeoIPIndexingFilter.java | 241 ----- .../nutch/indexer/geoip/package-info.java | 28 - src/plugin/index-links/build.xml | 22 - src/plugin/index-links/ivy.xml | 41 - src/plugin/index-links/plugin.xml | 41 - .../indexer/links/LinksIndexingFilter.java | 167 ---- .../indexer/links/TestLinksIndexingFilter.java | 218 ----- .../org/apache/nutch/parse/TestOutlinks.java | 54 -- src/plugin/index-metadata/build.xml | 22 - src/plugin/index-metadata/ivy.xml | 41 - src/plugin/index-metadata/plugin.xml | 42 - .../nutch/indexer/metadata/MetadataIndexer.java | 104 --- .../nutch/indexer/metadata/package-info.java | 23 - src/plugin/index-more/build.xml | 22 - src/plugin/index-more/ivy.xml | 41 - src/plugin/index-more/plugin.xml | 42 - .../nutch/indexer/more/MoreIndexingFilter.java | 344 ------- .../org/apache/nutch/indexer/more/package.html | 6 - .../indexer/more/TestMoreIndexingFilter.java | 123 --- src/plugin/index-replace/README.txt | 95 -- src/plugin/index-replace/build.xml | 55 -- src/plugin/index-replace/ivy.xml | 41 - src/plugin/index-replace/plugin.xml | 22 - .../index-replace/sample/testIndexReplace.html | 12 - .../nutch/indexer/replace/FieldReplacer.java | 196 ---- .../nutch/indexer/replace/ReplaceIndexer.java | 330 ------- .../nutch/indexer/replace/package-info.java | 22 - .../nutch/indexer/replace/TestIndexReplace.java | 456 ---------- src/plugin/index-static/build.xml | 22 - src/plugin/index-static/ivy.xml | 41 - src/plugin/index-static/plugin.xml | 42 - .../indexer/staticfield/StaticFieldIndexer.java | 143 --- .../nutch/indexer/staticfield/package.html | 5 - .../staticfield/TestStaticFieldIndexerTest.java | 194 ---- src/plugin/indexer-cloudsearch/README.md | 58 -- src/plugin/indexer-cloudsearch/build.xml | 22 - .../indexer-cloudsearch/createCSDomain.sh | 22 - src/plugin/indexer-cloudsearch/ivy.xml | 41 - src/plugin/indexer-cloudsearch/plugin.xml | 50 -- .../cloudsearch/CloudSearchConstants.java | 27 - .../cloudsearch/CloudSearchIndexWriter.java | 382 -------- .../cloudsearch/CloudSearchUtils.java | 73 -- src/plugin/indexer-dummy/build.xml | 22 - src/plugin/indexer-dummy/ivy.xml | 41 - src/plugin/indexer-dummy/plugin.xml | 38 - .../indexwriter/dummy/DummyIndexWriter.java | 103 --- .../nutch/indexwriter/dummy/package-info.java | 23 - src/plugin/indexer-elastic/build-ivy.xml | 54 -- src/plugin/indexer-elastic/build.xml | 22 - src/plugin/indexer-elastic/howto_upgrade_es.txt | 6 - src/plugin/indexer-elastic/ivy.xml | 43 - src/plugin/indexer-elastic/plugin.xml | 71 -- .../indexwriter/elastic/ElasticConstants.java | 28 - .../indexwriter/elastic/ElasticIndexWriter.java | 279 ------ .../nutch/indexwriter/elastic/package-info.java | 22 - src/plugin/indexer-solr/build-ivy.xml | 54 -- src/plugin/indexer-solr/build.xml | 22 - src/plugin/indexer-solr/ivy.xml | 44 - src/plugin/indexer-solr/plugin.xml | 48 - .../nutch/indexwriter/solr/SolrConstants.java | 56 -- .../nutch/indexwriter/solr/SolrIndexWriter.java | 277 ------ .../indexwriter/solr/SolrMappingReader.java | 147 --- .../nutch/indexwriter/solr/SolrUtils.java | 97 -- .../nutch/indexwriter/solr/package-info.java | 22 - src/plugin/language-identifier/build.xml | 38 - src/plugin/language-identifier/ivy.xml | 41 - src/plugin/language-identifier/plugin.xml | 49 - .../nutch/analysis/lang/HTMLLanguageParser.java | 320 ------- .../analysis/lang/LanguageIndexingFilter.java | 89 -- .../nutch/analysis/lang/langmappings.properties | 188 ---- .../org/apache/nutch/analysis/lang/package.html | 6 - .../analysis/lang/TestHTMLLanguageParser.java | 149 ---- .../test/org/apache/nutch/analysis/lang/da.test | 108 --- .../test/org/apache/nutch/analysis/lang/de.test | 104 --- .../test/org/apache/nutch/analysis/lang/el.test | 109 --- .../test/org/apache/nutch/analysis/lang/en.test | 105 --- .../test/org/apache/nutch/analysis/lang/es.test | 107 --- .../test/org/apache/nutch/analysis/lang/fi.test | 106 --- .../test/org/apache/nutch/analysis/lang/fr.test | 105 --- .../test/org/apache/nutch/analysis/lang/it.test | 109 --- .../test/org/apache/nutch/analysis/lang/nl.test | 105 --- .../test/org/apache/nutch/analysis/lang/pt.test | 105 --- .../test/org/apache/nutch/analysis/lang/sv.test | 108 --- .../nutch/analysis/lang/test-referencial.txt | 10 - src/plugin/lib-htmlunit/build-ivy.xml | 54 -- src/plugin/lib-htmlunit/build.xml | 28 - src/plugin/lib-htmlunit/ivy.xml | 52 -- src/plugin/lib-htmlunit/plugin.xml | 166 ---- .../protocol/htmlunit/HtmlUnitWebDriver.java | 189 ---- .../htmlunit/HtmlUnitWebWindowListener.java | 53 -- src/plugin/lib-http/build.xml | 22 - src/plugin/lib-http/ivy.xml | 41 - src/plugin/lib-http/plugin.xml | 33 - .../protocol/http/api/BlockedException.java | 26 - .../nutch/protocol/http/api/HttpBase.java | 587 ------------ .../nutch/protocol/http/api/HttpException.java | 40 - .../protocol/http/api/HttpRobotRulesParser.java | 167 ---- .../apache/nutch/protocol/http/api/package.html | 6 - .../protocol/http/api/TestRobotRulesParser.java | 123 --- src/plugin/lib-nekohtml/build.xml | 30 - src/plugin/lib-nekohtml/ivy.xml | 42 - src/plugin/lib-nekohtml/plugin.xml | 38 - src/plugin/lib-regex-filter/build.xml | 22 - src/plugin/lib-regex-filter/ivy.xml | 41 - src/plugin/lib-regex-filter/plugin.xml | 33 - .../apache/nutch/urlfilter/api/RegexRule.java | 102 --- .../nutch/urlfilter/api/RegexURLFilterBase.java | 315 ------- .../nutch/urlfilter/api/package-info.java | 23 - .../urlfilter/api/RegexURLFilterBaseTest.java | 134 --- src/plugin/lib-selenium/build-ivy.xml | 54 -- src/plugin/lib-selenium/build.xml | 28 - .../lib-selenium/howto_upgrade_selenium.txt | 15 - src/plugin/lib-selenium/ivy.xml | 52 -- src/plugin/lib-selenium/plugin.xml | 175 ---- .../nutch/protocol/selenium/HttpWebClient.java | 236 ----- src/plugin/lib-xml/build.xml | 36 - src/plugin/lib-xml/ivy.xml | 44 - src/plugin/lib-xml/plugin.xml | 65 -- src/plugin/microformats-reltag/build.xml | 27 - src/plugin/microformats-reltag/ivy.xml | 41 - src/plugin/microformats-reltag/plugin.xml | 49 - .../reltag/RelTagIndexingFilter.java | 77 -- .../nutch/microformats/reltag/RelTagParser.java | 148 ---- .../nutch/microformats/reltag/package.html | 8 - src/plugin/mimetype-filter/build.xml | 28 - src/plugin/mimetype-filter/ivy.xml | 41 - src/plugin/mimetype-filter/plugin.xml | 37 - .../mimetype-filter/sample/allow-images.txt | 34 - .../mimetype-filter/sample/block-html.txt | 34 - .../indexer/filter/MimeTypeIndexingFilter.java | 273 ------ .../filter/MimeTypeIndexingFilterTest.java | 114 --- src/plugin/nutch-extensionpoints/build.xml | 30 - src/plugin/nutch-extensionpoints/ivy.xml | 41 - src/plugin/nutch-extensionpoints/plugin.xml | 67 -- src/plugin/parse-ext/build.xml | 32 - src/plugin/parse-ext/command | 24 - src/plugin/parse-ext/ivy.xml | 41 - src/plugin/parse-ext/plugin.xml | 60 -- .../org/apache/nutch/parse/ext/ExtParser.java | 183 ---- .../apache/nutch/parse/ext/package-info.java | 22 - .../apache/nutch/parse/ext/TestExtParser.java | 130 --- src/plugin/parse-html/build.xml | 40 - src/plugin/parse-html/ivy.xml | 42 - src/plugin/parse-html/plugin.xml | 48 - .../org/apache/nutch/parse/html/DOMBuilder.java | 766 ---------------- .../nutch/parse/html/DOMContentUtils.java | 400 --------- .../nutch/parse/html/HTMLMetaProcessor.java | 214 ----- .../org/apache/nutch/parse/html/HtmlParser.java | 352 -------- .../parse/html/XMLCharacterRecognizer.java | 112 --- .../org/apache/nutch/parse/html/package.html | 5 - .../nutch/parse/html/TestDOMContentUtils.java | 347 -------- .../apache/nutch/parse/html/TestHtmlParser.java | 122 --- .../parse/html/TestRobotsMetaProcessor.java | 155 ---- src/plugin/parse-js/build.xml | 22 - src/plugin/parse-js/ivy.xml | 41 - src/plugin/parse-js/plugin.xml | 53 -- .../apache/nutch/parse/js/JSParseFilter.java | 301 ------- .../org/apache/nutch/parse/js/package-info.java | 23 - src/plugin/parse-metatags/README.txt | 17 - src/plugin/parse-metatags/build.xml | 37 - src/plugin/parse-metatags/ivy.xml | 41 - src/plugin/parse-metatags/plugin.xml | 22 - .../parse-metatags/sample/testMetatags.html | 9 - .../sample/testMultivalueMetatags.html | 12 - .../nutch/parse/metatags/MetaTagsParser.java | 124 --- .../nutch/parse/metatags/package-info.java | 24 - .../nutch/parse/metatags/TestMetatagParser.java | 104 --- src/plugin/parse-replace/README.txt | 91 -- src/plugin/parse-replace/build.xml | 37 - src/plugin/parse-replace/ivy.xml | 41 - src/plugin/parse-replace/plugin.xml | 22 - .../parse-replace/sample/testParseReplace.html | 11 - .../nutch/parse/replace/ReplaceParser.java | 74 -- .../nutch/parse/replace/package-info.java | 22 - .../nutch/parse/replace/TestParseReplace.java | 68 -- src/plugin/parse-swf/build.xml | 38 - src/plugin/parse-swf/ivy.xml | 41 - src/plugin/parse-swf/lib/javaswf-LICENSE.txt | 33 - src/plugin/parse-swf/lib/javaswf.jar | Bin 125369 -> 0 bytes src/plugin/parse-swf/plugin.xml | 44 - src/plugin/parse-swf/sample/test1.swf | Bin 21054 -> 0 bytes src/plugin/parse-swf/sample/test1.txt | 60 -- src/plugin/parse-swf/sample/test2.swf | Bin 42534 -> 0 bytes src/plugin/parse-swf/sample/test2.txt | 5 - src/plugin/parse-swf/sample/test3.swf | Bin 51562 -> 0 bytes src/plugin/parse-swf/sample/test3.txt | 11 - .../org/apache/nutch/parse/swf/SWFParser.java | 685 -------------- .../apache/nutch/parse/swf/package-info.java | 22 - .../apache/nutch/parse/swf/TestSWFParser.java | 94 -- src/plugin/parse-tika/build-ivy.xml | 54 -- src/plugin/parse-tika/build.xml | 55 -- src/plugin/parse-tika/howto_upgrade_tika.txt | 8 - src/plugin/parse-tika/ivy.xml | 46 - src/plugin/parse-tika/plugin.xml | 136 --- src/plugin/parse-tika/sample/encrypted.pdf | Bin 3431 -> 0 bytes src/plugin/parse-tika/sample/nutch.html | 519 ----------- src/plugin/parse-tika/sample/nutch_logo_tm.gif | Bin 2747 -> 0 bytes src/plugin/parse-tika/sample/ootest.odt | Bin 20753 -> 0 bytes src/plugin/parse-tika/sample/ootest.sxw | Bin 20125 -> 0 bytes src/plugin/parse-tika/sample/ootest.txt | 30 - src/plugin/parse-tika/sample/pdftest.pdf | 157 ---- src/plugin/parse-tika/sample/rsstest.rss | 37 - src/plugin/parse-tika/sample/test.rtf | 17 - src/plugin/parse-tika/sample/word97.doc | Bin 8192 -> 0 bytes .../tika/BoilerpipeExtractorRepository.java | 62 -- .../org/apache/nutch/parse/tika/DOMBuilder.java | 794 ----------------- .../nutch/parse/tika/DOMContentUtils.java | 402 --------- .../nutch/parse/tika/HTMLMetaProcessor.java | 214 ----- .../org/apache/nutch/parse/tika/TikaParser.java | 286 ------ .../parse/tika/XMLCharacterRecognizer.java | 112 --- .../apache/nutch/parse/tika/package-info.java | 23 - .../apache/nutch/tika/TestDOMContentUtils.java | 337 ------- .../org/apache/nutch/tika/TestFeedParser.java | 121 --- .../apache/nutch/tika/TestImageMetadata.java | 67 -- .../org/apache/nutch/tika/TestMSWordParser.java | 92 -- .../org/apache/nutch/tika/TestOOParser.java | 107 --- .../org/apache/nutch/tika/TestPdfParser.java | 73 -- .../org/apache/nutch/tika/TestRTFParser.java | 81 -- .../nutch/tika/TestRobotsMetaProcessor.java | 156 ---- src/plugin/parse-zip/build.xml | 38 - src/plugin/parse-zip/ivy.xml | 41 - src/plugin/parse-zip/plugin.xml | 46 - src/plugin/parse-zip/sample/test.zip | Bin 182 -> 0 bytes .../org/apache/nutch/parse/zip/ZipParser.java | 144 --- .../nutch/parse/zip/ZipTextExtractor.java | 120 --- .../apache/nutch/parse/zip/package-info.java | 22 - .../apache/nutch/parse/zip/TestZipParser.java | 71 -- src/plugin/parsefilter-naivebayes/build-ivy.xml | 54 -- src/plugin/parsefilter-naivebayes/build.xml | 22 - src/plugin/parsefilter-naivebayes/ivy.xml | 49 - src/plugin/parsefilter-naivebayes/plugin.xml | 56 -- .../nutch/parsefilter/naivebayes/Classify.java | 120 --- .../naivebayes/NaiveBayesParseFilter.java | 197 ---- .../nutch/parsefilter/naivebayes/Train.java | 148 ---- .../parsefilter/naivebayes/package-info.java | 28 - src/plugin/parsefilter-regex/build.xml | 27 - .../data/regex-parsefilter.txt | 10 - src/plugin/parsefilter-regex/ivy.xml | 37 - src/plugin/parsefilter-regex/plugin.xml | 42 - .../parsefilter/regex/RegexParseFilter.java | 199 ----- .../nutch/parsefilter/regex/package-info.java | 23 - .../parsefilter/regex/TestRegexParseFilter.java | 77 -- src/plugin/plugin.dtd | 206 ----- src/plugin/protocol-file/build.xml | 29 - src/plugin/protocol-file/ivy.xml | 41 - src/plugin/protocol-file/plugin.xml | 46 - .../protocol-file/sample/testprotocolfile.txt | 1 - .../sample/testprotocolfile_(encoded).txt | 1 - .../org/apache/nutch/protocol/file/File.java | 228 ----- .../apache/nutch/protocol/file/FileError.java | 36 - .../nutch/protocol/file/FileException.java | 40 - .../nutch/protocol/file/FileResponse.java | 317 ------- .../org/apache/nutch/protocol/file/package.html | 5 - .../nutch/protocol/file/TestProtocolFile.java | 99 --- src/plugin/protocol-ftp/build.xml | 22 - src/plugin/protocol-ftp/ivy.xml | 42 - src/plugin/protocol-ftp/plugin.xml | 46 - .../org/apache/nutch/protocol/ftp/Client.java | 595 ------------- .../java/org/apache/nutch/protocol/ftp/Ftp.java | 267 ------ .../org/apache/nutch/protocol/ftp/FtpError.java | 36 - .../apache/nutch/protocol/ftp/FtpException.java | 46 - .../ftp/FtpExceptionBadSystResponse.java | 29 - .../FtpExceptionCanNotHaveDataConnection.java | 29 - ...ExceptionControlClosedByForcedDataClose.java | 30 - .../ftp/FtpExceptionUnknownForcedDataClose.java | 30 - .../apache/nutch/protocol/ftp/FtpResponse.java | 521 ----------- .../nutch/protocol/ftp/FtpRobotRulesParser.java | 121 --- .../protocol/ftp/PrintCommandListener.java | 71 -- .../org/apache/nutch/protocol/ftp/package.html | 5 - src/plugin/protocol-htmlunit/build.xml | 37 - src/plugin/protocol-htmlunit/ivy.xml | 38 - src/plugin/protocol-htmlunit/plugin.xml | 51 -- .../apache/nutch/protocol/htmlunit/Http.java | 63 -- .../nutch/protocol/htmlunit/HttpResponse.java | 573 ------------ .../apache/nutch/protocol/htmlunit/package.html | 21 - src/plugin/protocol-http/build.xml | 50 -- src/plugin/protocol-http/ivy.xml | 41 - src/plugin/protocol-http/jsp/basic-http.jsp | 44 - src/plugin/protocol-http/jsp/brokenpage.jsp | 47 - src/plugin/protocol-http/jsp/redirect301.jsp | 49 - src/plugin/protocol-http/jsp/redirect302.jsp | 49 - src/plugin/protocol-http/plugin.xml | 51 -- .../org/apache/nutch/protocol/http/Http.java | 73 -- .../nutch/protocol/http/HttpResponse.java | 558 ------------ .../org/apache/nutch/protocol/http/package.html | 5 - .../src/test/conf/nutch-site-test.xml | 52 -- .../nutch/protocol/http/TestProtocolHttp.java | 140 --- src/plugin/protocol-httpclient/build.xml | 45 - src/plugin/protocol-httpclient/ivy.xml | 42 - src/plugin/protocol-httpclient/jsp/basic.jsp | 74 -- src/plugin/protocol-httpclient/jsp/cookies.jsp | 63 -- src/plugin/protocol-httpclient/jsp/digest.jsp | 68 -- src/plugin/protocol-httpclient/jsp/noauth.jsp | 36 - src/plugin/protocol-httpclient/jsp/ntlm.jsp | 89 -- src/plugin/protocol-httpclient/plugin.xml | 58 -- .../DummySSLProtocolSocketFactory.java | 163 ---- .../httpclient/DummyX509TrustManager.java | 92 -- .../apache/nutch/protocol/httpclient/Http.java | 572 ------------ .../protocol/httpclient/HttpAuthentication.java | 45 - .../httpclient/HttpAuthenticationException.java | 71 -- .../httpclient/HttpAuthenticationFactory.java | 98 -- .../httpclient/HttpBasicAuthentication.java | 199 ----- .../httpclient/HttpFormAuthConfigurer.java | 106 --- .../httpclient/HttpFormAuthentication.java | 223 ----- .../nutch/protocol/httpclient/HttpResponse.java | 216 ----- .../nutch/protocol/httpclient/package.html | 9 - .../src/test/conf/httpclient-auth-test.xml | 58 -- .../src/test/conf/nutch-site-test.xml | 52 -- .../httpclient/TestProtocolHttpClient.java | 217 ----- .../protocol-interactiveselenium/README.md | 38 - .../protocol-interactiveselenium/build-ivy.xml | 54 -- .../protocol-interactiveselenium/build.xml | 37 - src/plugin/protocol-interactiveselenium/ivy.xml | 42 - .../protocol-interactiveselenium/plugin.xml | 47 - .../protocol/interactiveselenium/Http.java | 59 -- .../interactiveselenium/HttpResponse.java | 399 --------- .../DefalultMultiInteractionHandler.java | 53 -- .../DefaultClickAllAjaxLinksHandler.java | 88 -- .../handlers/DefaultHandler.java | 30 - .../handlers/InteractiveSeleniumHandler.java | 25 - .../protocol/interactiveselenium/package.html | 5 - src/plugin/protocol-selenium/README.md | 208 ----- src/plugin/protocol-selenium/build-ivy.xml | 54 -- src/plugin/protocol-selenium/build.xml | 36 - src/plugin/protocol-selenium/ivy.xml | 42 - src/plugin/protocol-selenium/plugin.xml | 47 - .../apache/nutch/protocol/selenium/Http.java | 59 -- .../nutch/protocol/selenium/HttpResponse.java | 360 -------- .../apache/nutch/protocol/selenium/package.html | 5 - src/plugin/scoring-depth/build.xml | 6 - src/plugin/scoring-depth/ivy.xml | 41 - src/plugin/scoring-depth/plugin.xml | 24 - .../nutch/scoring/depth/DepthScoringFilter.java | 207 ----- .../nutch/scoring/depth/package-info.java | 23 - src/plugin/scoring-link/build.xml | 27 - src/plugin/scoring-link/ivy.xml | 41 - src/plugin/scoring-link/plugin.xml | 39 - .../scoring/link/LinkAnalysisScoringFilter.java | 95 -- .../apache/nutch/scoring/link/package-info.java | 23 - src/plugin/scoring-opic/build.xml | 27 - src/plugin/scoring-opic/ivy.xml | 41 - src/plugin/scoring-opic/plugin.xml | 39 - .../nutch/scoring/opic/OPICScoringFilter.java | 173 ---- .../apache/nutch/scoring/opic/package-info.java | 23 - src/plugin/scoring-similarity/build-ivy.xml | 54 -- src/plugin/scoring-similarity/build.xml | 27 - src/plugin/scoring-similarity/ivy.xml | 42 - src/plugin/scoring-similarity/plugin.xml | 45 - .../scoring/similarity/SimilarityModel.java | 38 - .../similarity/SimilarityScoringFilter.java | 70 -- .../similarity/cosine/CosineSimilarity.java | 84 -- .../scoring/similarity/cosine/DocVector.java | 57 -- .../nutch/scoring/similarity/cosine/Model.java | 190 ---- .../scoring/similarity/cosine/package-info.java | 7 - .../similarity/util/LuceneAnalyzerUtil.java | 93 -- .../similarity/util/LuceneTokenizer.java | 166 ---- .../scoring/similarity/util/package-info.java | 24 - src/plugin/subcollection/README.txt | 10 - src/plugin/subcollection/build.xml | 22 - src/plugin/subcollection/ivy.xml | 41 - src/plugin/subcollection/plugin.xml | 41 - .../nutch/collection/CollectionManager.java | 240 ----- .../apache/nutch/collection/Subcollection.java | 259 ------ .../org/apache/nutch/collection/package.html | 36 - .../SubcollectionIndexingFilter.java | 101 --- .../indexer/subcollection/package-info.java | 25 - .../nutch/collection/TestSubcollection.java | 112 --- src/plugin/tld/build.xml | 22 - src/plugin/tld/ivy.xml | 41 - src/plugin/tld/plugin.xml | 51 -- .../nutch/indexer/tld/TLDIndexingFilter.java | 69 -- .../org/apache/nutch/indexer/tld/package.html | 5 - .../nutch/scoring/tld/TLDScoringFilter.java | 114 --- .../org/apache/nutch/scoring/tld/package.html | 5 - src/plugin/urlfilter-automaton/build.xml | 51 -- src/plugin/urlfilter-automaton/ivy.xml | 42 - src/plugin/urlfilter-automaton/plugin.xml | 43 - .../urlfilter-automaton/sample/Benchmarks.rules | 26 - .../urlfilter-automaton/sample/Benchmarks.urls | 297 ------- .../sample/IntranetCrawling.rules | 24 - .../sample/IntranetCrawling.urls | 8 - .../sample/WholeWebCrawling.rules | 19 - .../sample/WholeWebCrawling.urls | 11 - .../urlfilter/automaton/AutomatonURLFilter.java | 116 --- .../nutch/urlfilter/automaton/package.html | 9 - .../automaton/TestAutomatonURLFilter.java | 56 -- src/plugin/urlfilter-domain/build.xml | 28 - src/plugin/urlfilter-domain/data/hosts.txt | 5 - src/plugin/urlfilter-domain/ivy.xml | 41 - src/plugin/urlfilter-domain/plugin.xml | 43 - .../nutch/urlfilter/domain/DomainURLFilter.java | 212 ----- .../nutch/urlfilter/domain/package-info.java | 25 - .../urlfilter/domain/TestDomainURLFilter.java | 67 -- src/plugin/urlfilter-domainblacklist/build.xml | 28 - .../urlfilter-domainblacklist/data/hosts.txt | 5 - src/plugin/urlfilter-domainblacklist/ivy.xml | 41 - src/plugin/urlfilter-domainblacklist/plugin.xml | 43 - .../DomainBlacklistURLFilter.java | 210 ----- .../urlfilter/domainblacklist/package-info.java | 24 - .../TestDomainBlacklistURLFilter.java | 49 - src/plugin/urlfilter-ignoreexempt/README.md | 43 - src/plugin/urlfilter-ignoreexempt/build.xml | 55 -- .../urlfilter-ignoreexempt/data/.donotdelete | 0 src/plugin/urlfilter-ignoreexempt/ivy.xml | 41 - src/plugin/urlfilter-ignoreexempt/plugin.xml | 45 - .../ignoreexempt/ExemptionUrlFilter.java | 101 --- .../urlfilter/ignoreexempt/package-info.java | 24 - src/plugin/urlfilter-prefix/build.xml | 22 - src/plugin/urlfilter-prefix/ivy.xml | 41 - src/plugin/urlfilter-prefix/plugin.xml | 47 - .../nutch/urlfilter/prefix/PrefixURLFilter.java | 178 ---- .../apache/nutch/urlfilter/prefix/package.html | 5 - .../urlfilter/prefix/TestPrefixURLFilter.java | 79 -- src/plugin/urlfilter-regex/build.xml | 51 -- src/plugin/urlfilter-regex/ivy.xml | 41 - src/plugin/urlfilter-regex/plugin.xml | 48 - .../urlfilter-regex/sample/Benchmarks.rules | 26 - .../urlfilter-regex/sample/Benchmarks.urls | 297 ------- .../sample/IntranetCrawling.rules | 27 - .../sample/IntranetCrawling.urls | 8 - .../sample/WholeWebCrawling.rules | 22 - .../sample/WholeWebCrawling.urls | 11 - .../urlfilter-regex/sample/nutch1838.rules | 12 - .../urlfilter-regex/sample/nutch1838.urls | 3 - .../nutch/urlfilter/regex/RegexURLFilter.java | 111 --- .../apache/nutch/urlfilter/regex/package.html | 5 - .../urlfilter/regex/TestRegexURLFilter.java | 61 -- src/plugin/urlfilter-suffix/build.xml | 22 - src/plugin/urlfilter-suffix/ivy.xml | 41 - src/plugin/urlfilter-suffix/plugin.xml | 47 - .../nutch/urlfilter/suffix/SuffixURLFilter.java | 331 ------- .../nutch/urlfilter/suffix/package-info.java | 23 - .../urlfilter/suffix/TestSuffixURLFilter.java | 123 --- src/plugin/urlfilter-validator/build.xml | 22 - src/plugin/urlfilter-validator/ivy.xml | 41 - src/plugin/urlfilter-validator/plugin.xml | 41 - .../nutch/urlfilter/validator/UrlValidator.java | 386 -------- .../nutch/urlfilter/validator/package.html | 9 - .../urlfilter/validator/TestUrlValidator.java | 79 -- src/plugin/urlmeta/build.xml | 22 - src/plugin/urlmeta/ivy.xml | 41 - src/plugin/urlmeta/plugin.xml | 47 - .../indexer/urlmeta/URLMetaIndexingFilter.java | 118 --- .../apache/nutch/indexer/urlmeta/package.html | 12 - .../scoring/urlmeta/URLMetaScoringFilter.java | 175 ---- .../apache/nutch/scoring/urlmeta/package.html | 11 - src/plugin/urlnormalizer-ajax/build.xml | 22 - src/plugin/urlnormalizer-ajax/ivy.xml | 41 - src/plugin/urlnormalizer-ajax/plugin.xml | 41 - .../urlnormalizer/ajax/AjaxURLNormalizer.java | 236 ----- .../ajax/TestAjaxURLNormalizer.java | 67 -- src/plugin/urlnormalizer-basic/build.xml | 22 - src/plugin/urlnormalizer-basic/ivy.xml | 41 - src/plugin/urlnormalizer-basic/plugin.xml | 41 - .../urlnormalizer/basic/BasicURLNormalizer.java | 290 ------ .../net/urlnormalizer/basic/package-info.java | 23 - .../basic/TestBasicURLNormalizer.java | 175 ---- src/plugin/urlnormalizer-host/build.xml | 27 - src/plugin/urlnormalizer-host/data/hosts.txt | 8 - src/plugin/urlnormalizer-host/ivy.xml | 41 - src/plugin/urlnormalizer-host/plugin.xml | 43 - .../urlnormalizer/host/HostURLNormalizer.java | 198 ----- .../net/urlnormalizer/host/package-info.java | 23 - .../host/TestHostURLNormalizer.java | 57 -- src/plugin/urlnormalizer-pass/build.xml | 22 - src/plugin/urlnormalizer-pass/ivy.xml | 41 - src/plugin/urlnormalizer-pass/plugin.xml | 41 - .../urlnormalizer/pass/PassURLNormalizer.java | 49 - .../net/urlnormalizer/pass/package-info.java | 23 - .../pass/TestPassURLNormalizer.java | 45 - src/plugin/urlnormalizer-protocol/build.xml | 27 - .../urlnormalizer-protocol/data/protocols.txt | 7 - src/plugin/urlnormalizer-protocol/ivy.xml | 41 - src/plugin/urlnormalizer-protocol/plugin.xml | 43 - .../protocol/ProtocolURLNormalizer.java | 190 ---- .../protocol/TestProtocolURLNormalizer.java | 55 -- src/plugin/urlnormalizer-querystring/build.xml | 22 - src/plugin/urlnormalizer-querystring/ivy.xml | 41 - src/plugin/urlnormalizer-querystring/plugin.xml | 42 - .../querystring/QuerystringURLNormalizer.java | 91 -- .../urlnormalizer/querystring/package-info.java | 23 - .../TestQuerystringURLNormalizer.java | 49 - src/plugin/urlnormalizer-regex/build.xml | 34 - src/plugin/urlnormalizer-regex/ivy.xml | 41 - src/plugin/urlnormalizer-regex/plugin.xml | 41 - .../sample/regex-normalize-default.test | 84 -- .../sample/regex-normalize-default.xml | 66 -- .../sample/regex-normalize-scope1.test | 8 - .../sample/regex-normalize-scope1.xml | 21 - .../urlnormalizer/regex/RegexURLNormalizer.java | 324 ------- .../net/urlnormalizer/regex/package-info.java | 23 - .../regex/TestRegexURLNormalizer.java | 186 ---- src/plugin/urlnormalizer-slash/build.xml | 27 - src/plugin/urlnormalizer-slash/data/slashes.txt | 7 - src/plugin/urlnormalizer-slash/ivy.xml | 41 - src/plugin/urlnormalizer-slash/plugin.xml | 43 - .../urlnormalizer/slash/SlashURLNormalizer.java | 224 ----- .../slash/TestSlashURLNormalizer.java | 73 -- src/test/crawl-tests.xml | 62 -- src/test/domain-urlfilter.txt | 22 - src/test/filter-all.txt | 7 - src/test/log4j.properties | 7 - src/test/nutch-site.xml | 19 - .../nutch/crawl/ContinuousCrawlTestUtil.java | 270 ------ .../org/apache/nutch/crawl/CrawlDBTestUtil.java | 179 ---- .../nutch/crawl/CrawlDbUpdateTestDriver.java | 138 --- .../apache/nutch/crawl/CrawlDbUpdateUtil.java | 166 ---- .../org/apache/nutch/crawl/DummyWritable.java | 32 - .../nutch/crawl/TODOTestCrawlDbStates.java | 168 ---- .../nutch/crawl/TestAdaptiveFetchSchedule.java | 121 --- .../apache/nutch/crawl/TestCrawlDbFilter.java | 145 --- .../apache/nutch/crawl/TestCrawlDbMerger.java | 160 ---- .../apache/nutch/crawl/TestCrawlDbStates.java | 566 ------------ .../org/apache/nutch/crawl/TestGenerator.java | 370 -------- .../org/apache/nutch/crawl/TestInjector.java | 181 ---- .../apache/nutch/crawl/TestLinkDbMerger.java | 160 ---- .../nutch/crawl/TestSignatureFactory.java | 35 - .../org/apache/nutch/fetcher/TestFetcher.java | 207 ----- .../nutch/indexer/TestIndexerMapReduce.java | 187 ---- .../nutch/indexer/TestIndexingFilters.java | 110 --- .../org/apache/nutch/metadata/TestMetadata.java | 281 ------ .../metadata/TestSpellCheckedMetadata.java | 303 ------- .../org/apache/nutch/net/TestURLFilters.java | 41 - .../apache/nutch/net/TestURLNormalizers.java | 83 -- .../nutch/parse/TestOutlinkExtractor.java | 99 --- .../org/apache/nutch/parse/TestParseData.java | 58 -- .../org/apache/nutch/parse/TestParseText.java | 34 - .../apache/nutch/parse/TestParserFactory.java | 105 --- .../apache/nutch/parse/parse-plugin-test.xml | 58 -- .../nutch/plugin/HelloWorldExtension.java | 36 - .../org/apache/nutch/plugin/ITestExtension.java | 27 - .../apache/nutch/plugin/SimpleTestPlugin.java | 57 -- .../apache/nutch/plugin/TestPluginSystem.java | 302 ------- .../org/apache/nutch/protocol/TestContent.java | 94 -- .../nutch/protocol/TestProtocolFactory.java | 85 -- .../apache/nutch/segment/TestSegmentMerger.java | 131 --- .../segment/TestSegmentMergerCrawlDatums.java | 427 --------- .../apache/nutch/service/TestNutchServer.java | 65 -- .../nutch/tools/TestCommonCrawlDataDumper.java | 125 --- .../tools/proxy/AbstractTestbedHandler.java | 49 - .../apache/nutch/tools/proxy/DelayHandler.java | 56 -- .../apache/nutch/tools/proxy/FakeHandler.java | 102 --- .../nutch/tools/proxy/LogDebugHandler.java | 64 -- .../nutch/tools/proxy/NotFoundHandler.java | 40 - .../apache/nutch/tools/proxy/ProxyTestbed.java | 156 ---- .../nutch/tools/proxy/SegmentHandler.java | 255 ------ .../apache/nutch/tools/proxy/package-info.java | 22 - .../org/apache/nutch/util/DumpFileUtilTest.java | 68 -- .../apache/nutch/util/TestEncodingDetector.java | 90 -- .../org/apache/nutch/util/TestGZIPUtils.java | 241 ----- .../org/apache/nutch/util/TestMimeUtil.java | 127 --- .../org/apache/nutch/util/TestNodeWalker.java | 107 --- .../nutch/util/TestPrefixStringMatcher.java | 115 --- .../org/apache/nutch/util/TestStringUtil.java | 61 -- .../nutch/util/TestSuffixStringMatcher.java | 114 --- .../org/apache/nutch/util/TestTableUtil.java | 75 -- src/test/org/apache/nutch/util/TestURLUtil.java | 281 ------ .../apache/nutch/util/WritableTestUtils.java | 55 -- .../fetch-test-site/dup_of_pagea.html | 11 - .../fetch-test-site/exception.html | 13 - src/testresources/fetch-test-site/index.html | 13 - .../fetch-test-site/nested_spider_trap.html | 23 - src/testresources/fetch-test-site/pagea.html | 11 - src/testresources/fetch-test-site/pageb.html | 11 - src/testresources/fetch-test-site/robots.txt | 0 src/testresources/test-mime-util/test.xlsx | Bin 3950 -> 0 bytes .../20150309101625/content/part-00000/.data.crc | Bin 124 -> 0 bytes .../content/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101625/content/part-00000/data | Bin 14452 -> 0 bytes .../20150309101625/content/part-00000/index | Bin 217 -> 0 bytes .../crawl_fetch/part-00000/.data.crc | Bin 12 -> 0 bytes .../crawl_fetch/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101625/crawl_fetch/part-00000/data | Bin 293 -> 0 bytes .../20150309101625/crawl_fetch/part-00000/index | Bin 217 -> 0 bytes .../crawl_generate/.part-00000.crc | Bin 12 -> 0 bytes .../20150309101625/crawl_generate/part-00000 | Bin 169 -> 0 bytes .../20150309101625/crawl_parse/.part-00000.crc | Bin 68 -> 0 bytes .../20150309101625/crawl_parse/part-00000 | Bin 7627 -> 0 bytes .../parse_data/part-00000/.data.crc | Bin 24 -> 0 bytes .../parse_data/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101625/parse_data/part-00000/data | Bin 1985 -> 0 bytes .../20150309101625/parse_data/part-00000/index | Bin 217 -> 0 bytes .../parse_text/part-00000/.data.crc | Bin 60 -> 0 bytes .../parse_text/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101625/parse_text/part-00000/data | Bin 6554 -> 0 bytes .../20150309101625/parse_text/part-00000/index | Bin 217 -> 0 bytes .../20150309101656/content/part-00000/.data.crc | Bin 3372 -> 0 bytes .../content/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101656/content/part-00000/data | Bin 430250 -> 0 bytes .../20150309101656/content/part-00000/index | Bin 220 -> 0 bytes .../crawl_fetch/part-00000/.data.crc | Bin 104 -> 0 bytes .../crawl_fetch/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101656/crawl_fetch/part-00000/data | Bin 12121 -> 0 bytes .../20150309101656/crawl_fetch/part-00000/index | Bin 220 -> 0 bytes .../crawl_generate/.part-00000.crc | Bin 52 -> 0 bytes .../20150309101656/crawl_generate/part-00000 | Bin 5590 -> 0 bytes .../20150309101656/crawl_parse/.part-00000.crc | Bin 1652 -> 0 bytes .../20150309101656/crawl_parse/part-00000 | Bin 210047 -> 0 bytes .../parse_data/part-00000/.data.crc | Bin 460 -> 0 bytes .../parse_data/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101656/parse_data/part-00000/data | Bin 57355 -> 0 bytes .../20150309101656/parse_data/part-00000/index | Bin 220 -> 0 bytes .../parse_text/part-00000/.data.crc | Bin 1260 -> 0 bytes .../parse_text/part-00000/.index.crc | Bin 12 -> 0 bytes .../20150309101656/parse_text/part-00000/data | Bin 159920 -> 0 bytes .../20150309101656/parse_text/part-00000/index | Bin 220 -> 0 bytes 1973 files changed, 102499 insertions(+), 98774 deletions(-) ----------------------------------------------------------------------
