Author: lewismc
Date: Fri Jan 9 06:34:33 2015
New Revision: 1650447
URL: http://svn.apache.org/r1650447
Log:
NUTCH-1779 Apply formatting to the code
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/api/NutchServer.java
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/NutchServerPoolExecutor.java
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMConfManager.java
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbIterator.java
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbPageConverter.java
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/api/model/response/NutchStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/api/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/api/resources/SeedResource.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/CrawlStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchSchedule.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/NutchWritable.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureFactory.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextProfileSignature.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/URLPartitioner.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/UrlWithScore.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetchEntry.java
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostDb.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbReader.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateJob.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/host/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/CleaningJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexCleaningFilter.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexCleaningFilters.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriter.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexWriters.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFilter.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFilters.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/NutchDocument.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrConstants.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java
nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java
nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java
nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java
nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusCodes.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/Parser.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserFactory.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserNotFound.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/CircularDependencyException.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/Extension.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/ExtensionPoint.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/MissingDependencyException.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/Pluggable.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/Plugin.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginClassLoader.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRepository.java
nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginRuntimeException.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/Content.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolNotFound.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolOutput.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusCodes.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRules.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoreDatum.java
nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoringFilter.java
nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoringFilterException.java
nutch/branches/2.x/src/java/org/apache/nutch/scoring/ScoringFilters.java
nutch/branches/2.x/src/java/org/apache/nutch/scoring/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/WebTableCreator.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/ResolveUrls.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/arc/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/DelayHandler.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/FakeHandler.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/LogDebugHandler.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/NotFoundHandler.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/TestbedProxy.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/proxy/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java
nutch/branches/2.x/src/java/org/apache/nutch/util/CommandRunner.java
nutch/branches/2.x/src/java/org/apache/nutch/util/DeflateUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/util/DomUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java
nutch/branches/2.x/src/java/org/apache/nutch/util/FSUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/util/GZIPUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
nutch/branches/2.x/src/java/org/apache/nutch/util/HadoopFSUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/Histogram.java
nutch/branches/2.x/src/java/org/apache/nutch/util/IdentityPageReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/util/LockUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/NodeWalker.java
nutch/branches/2.x/src/java/org/apache/nutch/util/NutchConfiguration.java
nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJobConf.java
nutch/branches/2.x/src/java/org/apache/nutch/util/NutchTool.java
nutch/branches/2.x/src/java/org/apache/nutch/util/ObjectCache.java
nutch/branches/2.x/src/java/org/apache/nutch/util/PrefixStringMatcher.java
nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/SuffixStringMatcher.java
nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/TimingUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/ToolUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/TrieStringMatcher.java
nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/WebPageWritable.java
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainStatistics.java
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainSuffix.java
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
nutch/branches/2.x/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
nutch/branches/2.x/src/java/org/apache/nutch/util/package-info.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/NutchUiApplication.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/NutchUiServer.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/NutchClient.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/NutchClientFactory.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/model/JobConfig.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/client/model/NutchStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/config/CustomDaoFactory.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/config/CustomTableCreator.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/config/SpringConfiguration.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/model/NutchConfig.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/model/SeedUrl.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/AbstractBasePage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/LogOutPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/SchedulingPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/SearchPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/StatisticsPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/UrlsUploadPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/UserSettingsPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/instances/InstancePanel.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/instances/InstancesPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/seed/SeedPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/service/CrawlService.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/service/NutchService.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
nutch/branches/2.x/src/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
nutch/branches/2.x/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
nutch/branches/2.x/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/branches/2.x/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java
nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
nutch/branches/2.x/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
nutch/branches/2.x/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java
nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
nutch/branches/2.x/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
nutch/branches/2.x/src/plugin/tld/src/test/org/apache/nutch/indexer/tld/TestTLDIndexingFilter.java
nutch/branches/2.x/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
nutch/branches/2.x/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
nutch/branches/2.x/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java
nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
nutch/branches/2.x/src/test/org/apache/nutch/api/TestAPI.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/DummyWritable.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java
nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestMetadata.java
nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLFilters.java
nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLNormalizers.java
nutch/branches/2.x/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParserFactory.java
nutch/branches/2.x/src/test/org/apache/nutch/plugin/HelloWorldExtension.java
nutch/branches/2.x/src/test/org/apache/nutch/plugin/ITestExtension.java
nutch/branches/2.x/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java
nutch/branches/2.x/src/test/org/apache/nutch/plugin/TestPluginSystem.java
nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestContent.java
nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java
nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestGZIPUtils.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestMimeUtil.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestNodeWalker.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestStringUtil.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestTableUtil.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java
nutch/branches/2.x/src/test/org/apache/nutch/util/WritableTestUtils.java
nutch/branches/2.x/src/test/org/apache/nutch/webui/client/TestCrawlCycle.java
nutch/branches/2.x/src/test/org/apache/nutch/webui/client/TestNutchClientFactory.java
nutch/branches/2.x/src/test/org/apache/nutch/webui/client/TestRemoteCommandExecutor.java
nutch/branches/2.x/src/test/org/apache/nutch/webui/client/TestRemoteCommandsBatchFactory.java
nutch/branches/2.x/src/test/org/apache/nutch/webui/service/NutchServiceTest.java
nutch/branches/2.x/src/test/org/apache/nutch/webui/view/SpringConfigForTests.java
nutch/branches/2.x/src/test/org/apache/nutch/webui/view/TestColorEnumLabel.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jan 9 06:34:33 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development 2.3-SNAPSHOT
+* NUTCH-1779 Apply formatting to the code (lewismc)
+
* NUTCH-1907 Incorrect output of Outlinks to Hosts within HostDbUpdateReducer
(lewismc)
* NUTCH-1856 Document webpage.avsc and host.avsc (lewismc)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/NutchServer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/NutchServer.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/NutchServer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/NutchServer.java Fri Jan 9 06:34:33 2015
@@ -165,7 +165,8 @@ public class NutchServer extends Applica
* Safety and convenience method to determine whether or not it is safe to
* shut down the server. We make this assertion by consulting the
* {@link org.apache.nutch.api.NutchApp#jobManager} for a list of jobs with
- * {@link org.apache.nutch.api.model.response.JobInfo#state} equal to 'RUNNING'.
+ * {@link org.apache.nutch.api.model.response.JobInfo#state} equal to
+ * 'RUNNING'.
*
* @param force
* ignore running tasks
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/NutchServerPoolExecutor.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/NutchServerPoolExecutor.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/NutchServerPoolExecutor.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/NutchServerPoolExecutor.java Fri Jan 9 06:34:33 2015
@@ -103,7 +103,7 @@ public class NutchServerPoolExecutor ext
public JobInfo getInfo(String jobId) {
for (JobInfo jobInfo : getAllJobs()) {
- if(StringUtils.equals(jobId, jobInfo.getId())){
+ if (StringUtils.equals(jobId, jobInfo.getId())) {
return jobInfo;
}
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMConfManager.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMConfManager.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMConfManager.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMConfManager.java Fri Jan 9 06:34:33 2015
@@ -89,7 +89,7 @@ public class RAMConfManager implements C
if (!canCreate(nutchConfig)) {
throw new IllegalArgumentException("Config already exists.");
}
-
+
createHadoopConfig(nutchConfig);
return nutchConfig.getConfigId();
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java Fri Jan 9 06:34:33 2015
@@ -80,9 +80,10 @@ public class RAMJobManager implements Jo
private NutchTool createTool(JobConfig jobConfig, Configuration conf) {
if (StringUtils.isNotBlank(jobConfig.getJobClassName())) {
- return jobFactory.createToolByClassName(jobConfig.getJobClassName(), conf);
+ return jobFactory
+ .createToolByClassName(jobConfig.getJobClassName(), conf);
}
-
+
return jobFactory.createToolByType(jobConfig.getType(), conf);
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbIterator.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbIterator.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbIterator.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbIterator.java Fri Jan 9 06:34:33 2015
@@ -100,7 +100,8 @@ public class DbIterator extends Unmodifi
}
private Map<String, Object> pageAsMap(String url, WebPage page) {
- Map<String, Object> result = DbPageConverter.convertPage(page, commonFields);
+ Map<String, Object> result = DbPageConverter
+ .convertPage(page, commonFields);
if (CollectionUtils.isEmpty(commonFields) || commonFields.contains("url")) {
result.put("url", TableUtil.unreverseUrl(url));
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbPageConverter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbPageConverter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbPageConverter.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/db/DbPageConverter.java Fri Jan 9 06:34:33 2015
@@ -103,7 +103,7 @@ public class DbPageConverter {
if (CollectionUtils.isEmpty(queryFields)) {
return Sets.newHashSet(pageFields);
}
-
+
Set<Field> filteredFields = Sets.newLinkedHashSet();
for (Field field : pageFields) {
if (queryFields.contains(field.name())) {
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/api/impl/package-info.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/package-info.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/package-info.java Fri Jan 9 06:34:33 2015
@@ -19,3 +19,4 @@
* Implementations of REST API interfaces.
*/
package org.apache.nutch.api.impl;
+
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/api/model/response/NutchStatus.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/model/response/NutchStatus.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/model/response/NutchStatus.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/model/response/NutchStatus.java Fri Jan 9 06:34:33 2015
@@ -54,18 +54,16 @@ public class NutchStatus {
this.jobs = jobs;
}
- public Collection<JobInfo> getRunningJobs()
- {
+ public Collection<JobInfo> getRunningJobs() {
return purgeFinishedFailedJobs(runningJobs);
}
-
public void setRunningJobs(Collection<JobInfo> runningJobs) {
this.runningJobs = runningJobs;
}
- private Collection<JobInfo> purgeFinishedFailedJobs(Collection<JobInfo> runningJobColl)
- {
+ private Collection<JobInfo> purgeFinishedFailedJobs(
+ Collection<JobInfo> runningJobColl) {
if (CollectionUtils.isNotEmpty(runningJobColl)) {
Iterator<JobInfo> runningJobsIterator = runningJobColl.iterator();
while (runningJobsIterator.hasNext()) {
@@ -73,8 +71,7 @@ public class NutchStatus {
if (jobInfo.getState().equals(State.FINISHED)) {
runningJobsIterator.remove();
- }
- else if (jobInfo.getState().equals(State.FAILED)) {
+ } else if (jobInfo.getState().equals(State.FAILED)) {
runningJobsIterator.remove();
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/package-info.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/package-info.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/package-info.java Fri Jan 9 06:34:33 2015
@@ -19,3 +19,4 @@
* REST API to run and control crawl jobs.
*/
package org.apache.nutch.api;
+
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/api/resources/SeedResource.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/resources/SeedResource.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/resources/SeedResource.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/resources/SeedResource.java Fri Jan 9 06:34:33 2015
@@ -43,7 +43,8 @@ import com.google.common.io.Files;
@Path("/seed")
public class SeedResource extends AbstractResource {
- private static final Logger log = LoggerFactory.getLogger(AdminResource.class);
+ private static final Logger log = LoggerFactory
+ .getLogger(AdminResource.class);
@POST
@Path("/create")
@@ -101,8 +102,8 @@ public class SeedResource extends Abstra
private RuntimeException handleException(Exception e) {
log.error("Cannot create seed file!", e);
- return new WebApplicationException(status(Status.INTERNAL_SERVER_ERROR).entity(
- "Cannot create seed file!").build());
+ return new WebApplicationException(status(Status.INTERNAL_SERVER_ERROR)
+ .entity("Cannot create seed file!").build());
}
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Fri Jan 9 06:34:33 2015
@@ -29,13 +29,13 @@ import java.util.Set;
/**
* This class provides common methods for implementations of
* {@link FetchSchedule}.
- *
+ *
* @author Andrzej Bialecki
*/
-public abstract class AbstractFetchSchedule
-extends Configured
-implements FetchSchedule {
- private static final Logger LOG = LoggerFactory.getLogger(AbstractFetchSchedule.class);
+public abstract class AbstractFetchSchedule extends Configured implements
+ FetchSchedule {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(AbstractFetchSchedule.class);
protected int defaultInterval;
protected int maxInterval;
@@ -59,20 +59,22 @@ implements FetchSchedule {
@Override
public void setConf(Configuration conf) {
super.setConf(conf);
- if (conf == null) return;
+ if (conf == null)
+ return;
defaultInterval = conf.getInt("db.fetch.interval.default", 0);
- maxInterval = conf.getInt("db.fetch.interval.max", 0 );
+ maxInterval = conf.getInt("db.fetch.interval.max", 0);
LOG.info("defaultInterval=" + defaultInterval);
LOG.info("maxInterval=" + maxInterval);
}
-
+
/**
- * Initialize fetch schedule related data. Implementations should at least
- * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
- * implementation sets the <code>fetchTime</code> to now, using the
- * default <code>fetchInterval</code>.
- *
- * @param url URL of the page.
+ * Initialize fetch schedule related data. Implementations should at least set
+ * the <code>fetchTime</code> and <code>fetchInterval</code>. The default
+ * implementation sets the <code>fetchTime</code> to now, using the default
+ * <code>fetchInterval</code>.
+ *
+ * @param url
+ * URL of the page.
* @param page
*/
@Override
@@ -84,27 +86,31 @@ implements FetchSchedule {
/**
* Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
- * successfully fetched page. NOTE: this implementation resets the
- * retry counter - extending classes should call super.setFetchSchedule() to
+ * successfully fetched page. NOTE: this implementation resets the retry
+ * counter - extending classes should call super.setFetchSchedule() to
* preserve this behavior.
*/
@Override
- public void setFetchSchedule(String url, WebPage page,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state) {
+ public void setFetchSchedule(String url, WebPage page, long prevFetchTime,
+ long prevModifiedTime, long fetchTime, long modifiedTime, int state) {
page.setRetriesSinceFetch(0);
}
/**
- * This method specifies how to schedule refetching of pages
- * marked as GONE. Default implementation increases fetchInterval by 50%
- * but the value may never exceed <code>maxInterval</code>.
- * @param url URL of the page
+ * This method specifies how to schedule refetching of pages marked as GONE.
+ * Default implementation increases fetchInterval by 50% but the value may
+ * never exceed <code>maxInterval</code>.
+ *
+ * @param url
+ * URL of the page
* @param page
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * NOTE: this may be a different instance than
+ * @param datum
+ * , but implementations should make sure that it contains at least
+ * all information from
+ * @param datum
+ * .
*/
@Override
public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
@@ -121,25 +127,30 @@ implements FetchSchedule {
}
/**
- * This method adjusts the fetch schedule if fetching needs to be
- * re-tried due to transient errors. The default implementation
- * sets the next fetch time 1 day in the future and increases
- * the retry counter.
- * @param url URL of the page
+ * This method adjusts the fetch schedule if fetching needs to be re-tried due
+ * to transient errors. The default implementation sets the next fetch time 1
+ * day in the future and increases the retry counter.
+ *
+ * @param url
+ * URL of the page
* @param page
- * @param prevFetchTime previous fetch time
- * @param prevModifiedTime previous modified time
- * @param fetchTime current fetch time
+ * @param prevFetchTime
+ * previous fetch time
+ * @param prevModifiedTime
+ * previous modified time
+ * @param fetchTime
+ * current fetch time
*/
@Override
public void setPageRetrySchedule(String url, WebPage page,
- long prevFetchTime, long prevModifiedTime, long fetchTime) {
+ long prevFetchTime, long prevModifiedTime, long fetchTime) {
page.setFetchTime(fetchTime + SECONDS_PER_DAY * 1000L);
page.setRetriesSinceFetch(page.getRetriesSinceFetch() + 1);
}
/**
* This method return the last fetch time of the CrawlDatum
+ *
* @return the date as a long.
*/
@Override
@@ -148,20 +159,24 @@ implements FetchSchedule {
}
/**
- * This method provides information whether the page is suitable for
- * selection in the current fetchlist. NOTE: a true return value does not
- * guarantee that the page will be fetched, it just allows it to be
- * included in the further selection process based on scores. The default
- * implementation checks <code>fetchTime</code>, if it is higher than the
- * {@param curTime} it returns false, and true otherwise. It will also
- * check that fetchTime is not too remote (more than <code>maxInterval</code),
- * in which case it lowers the interval and returns true.
- * @param url URL of the page
+ * This method provides information whether the page is suitable for selection
+ * in the current fetchlist. NOTE: a true return value does not guarantee that
+ * the page will be fetched, it just allows it to be included in the further
+ * selection process based on scores. The default implementation checks
+ * <code>fetchTime</code>, if it is higher than the
+ *
+ * @param curTime
+ * it returns false, and true otherwise. It will also check that
+ * fetchTime is not too remote (more than <code>maxInterval</code),
+ * in which case it lowers the interval and returns true.
+ * @param url
+ * URL of the page
* @param page
- * @param curTime reference time (usually set to the time when the
- * fetchlist generation process was started).
+ * @param curTime
+ * reference time (usually set to the time when the fetchlist
+ * generation process was started).
* @return true, if the page should be considered for inclusion in the current
- * fetchlist, otherwise false.
+ * fetchlist, otherwise false.
*/
@Override
public boolean shouldFetch(String url, WebPage page, long curTime) {
@@ -181,11 +196,14 @@ implements FetchSchedule {
/**
* This method resets fetchTime, fetchInterval, modifiedTime,
* retriesSinceFetch and page signature, so that it forces refetching.
- * @param url URL of the page
+ *
+ * @param url
+ * URL of the page
* @param page
- * @param asap if true, force refetch as soon as possible - this sets
- * the fetchTime to now. If false, force refetch whenever the next fetch
- * time is set.
+ * @param asap
+ * if true, force refetch as soon as possible - this sets the
+ * fetchTime to now. If false, force refetch whenever the next fetch
+ * time is set.
*/
@Override
public void forceRefetch(String url, WebPage page, boolean asap) {
@@ -196,10 +214,10 @@ implements FetchSchedule {
page.setRetriesSinceFetch(0);
// TODO: row.setSignature(null) ??
page.setModifiedTime(0L);
- if (asap) page.setFetchTime(System.currentTimeMillis());
+ if (asap)
+ page.setFetchTime(System.currentTimeMillis());
}
-
public Set<WebPage.Field> getFields() {
return FIELDS;
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Fri Jan 9 06:34:33 2015
@@ -30,11 +30,12 @@ import org.apache.nutch.storage.WebPage;
* If SYNC_DELTA property is true, then:
* <ul>
* <li>calculate a <code>delta = fetchTime - modifiedTime</code></li>
- * <li>try to synchronize with the time of change, by shifting the next fetchTime
- * by a fraction of the difference between the last modification time and the last
- * fetch time. I.e. the next fetch time will be set to
+ * <li>try to synchronize with the time of change, by shifting the next
+ * fetchTime by a fraction of the difference between the last modification time
+ * and the last fetch time. I.e. the next fetch time will be set to
* <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li>
- * <li>if the adjusted fetch interval is bigger than the delta, then <code>fetchInterval = delta</code>.</li>
+ * <li>if the adjusted fetch interval is bigger than the delta, then
+ * <code>fetchInterval = delta</code>.</li>
* </ul>
* </li>
* <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL
@@ -42,10 +43,13 @@ import org.apache.nutch.storage.WebPage;
* <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL
* (default is 365 days).</li>
* </ul>
- * <p>NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize the algorithm,
- * so that the fetch interval either increases or decreases infinitely, with little
- * relevance to the page changes. Please use {@link #main(String[])} method to
- * test the values before applying them in a production system.</p>
+ * <p>
+ * NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize
+ * the algorithm, so that the fetch interval either increases or decreases
+ * infinitely, with little relevance to the page changes. Please use
+ * {@link #main(String[])} method to test the values before applying them in a
+ * production system.
+ * </p>
*
* @author Andrzej Bialecki
*/
@@ -58,56 +62,61 @@ public class AdaptiveFetchSchedule exten
private int MAX_INTERVAL;
private int MIN_INTERVAL;
-
+
private boolean SYNC_DELTA;
private double SYNC_DELTA_RATE;
-
+
public void setConf(Configuration conf) {
super.setConf(conf);
- if (conf == null) return;
+ if (conf == null)
+ return;
INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60);
- MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval", SECONDS_PER_DAY * 365 ); // 1 year
+ MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval",
+ SECONDS_PER_DAY * 365); // 1 year
SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta",
true);
- SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
+ SYNC_DELTA_RATE = conf.getFloat(
+ "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
}
@Override
- public void setFetchSchedule(String url, WebPage page,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state) {
+ public void setFetchSchedule(String url, WebPage page, long prevFetchTime,
+ long prevModifiedTime, long fetchTime, long modifiedTime, int state) {
super.setFetchSchedule(url, page, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
long refTime = fetchTime;
- if (modifiedTime <= 0) modifiedTime = fetchTime;
+ if (modifiedTime <= 0)
+ modifiedTime = fetchTime;
int interval = page.getFetchInterval();
switch (state) {
- case FetchSchedule.STATUS_MODIFIED:
- interval *= (1.0f - DEC_RATE);
- break;
- case FetchSchedule.STATUS_NOTMODIFIED:
- interval *= (1.0f + INC_RATE);
- break;
- case FetchSchedule.STATUS_UNKNOWN:
- break;
+ case FetchSchedule.STATUS_MODIFIED:
+ interval *= (1.0f - DEC_RATE);
+ break;
+ case FetchSchedule.STATUS_NOTMODIFIED:
+ interval *= (1.0f + INC_RATE);
+ break;
+ case FetchSchedule.STATUS_UNKNOWN:
+ break;
}
if (SYNC_DELTA) {
// try to synchronize with the time of change
// TODO: different from normal class (is delta in seconds)?
- int delta = (int) ((fetchTime - modifiedTime) / 1000L) ;
- if (delta > interval) interval = delta;
+ int delta = (int) ((fetchTime - modifiedTime) / 1000L);
+ if (delta > interval)
+ interval = delta;
refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE);
}
- if (interval < MIN_INTERVAL) interval = MIN_INTERVAL;
- if (interval > MAX_INTERVAL) interval = MAX_INTERVAL;
-
+ if (interval < MIN_INTERVAL)
+ interval = MIN_INTERVAL;
+ if (interval > MAX_INTERVAL)
+ interval = MAX_INTERVAL;
+
page.setFetchInterval(interval);
page.setFetchTime(refTime + interval * 1000L);
page.setModifiedTime(modifiedTime);
page.setPrevModifiedTime(prevModifiedTime);
}
-
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/CrawlStatus.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/CrawlStatus.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/CrawlStatus.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/CrawlStatus.java Fri Jan 9 06:34:33 2015
@@ -21,22 +21,22 @@ import java.util.Map;
public class CrawlStatus {
/** Page was not fetched yet. */
- public static final byte STATUS_UNFETCHED = 0x01;
+ public static final byte STATUS_UNFETCHED = 0x01;
/** Page was successfully fetched. */
- public static final byte STATUS_FETCHED = 0x02;
+ public static final byte STATUS_FETCHED = 0x02;
/** Page no longer exists. */
- public static final byte STATUS_GONE = 0x03;
+ public static final byte STATUS_GONE = 0x03;
/** Page temporarily redirects to other page. */
- public static final byte STATUS_REDIR_TEMP = 0x04;
+ public static final byte STATUS_REDIR_TEMP = 0x04;
/** Page permanently redirects to other page. */
- public static final byte STATUS_REDIR_PERM = 0x05;
+ public static final byte STATUS_REDIR_PERM = 0x05;
/** Fetching unsuccessful, needs to be retried (transient errors). */
- public static final byte STATUS_RETRY = 0x22;
+ public static final byte STATUS_RETRY = 0x22;
/** Fetching successful - page is not modified. */
- public static final byte STATUS_NOTMODIFIED = 0x26;
-
+ public static final byte STATUS_NOTMODIFIED = 0x26;
+
private static final Map<Byte, String> NAMES = new HashMap<Byte, String>();
-
+
static {
NAMES.put(STATUS_UNFETCHED, "status_unfetched");
NAMES.put(STATUS_FETCHED, "status_fetched");
@@ -46,9 +46,9 @@ public class CrawlStatus {
NAMES.put(STATUS_RETRY, "status_retry");
NAMES.put(STATUS_NOTMODIFIED, "status_notmodified");
}
-
+
public static String getName(byte status) {
return NAMES.get(status);
}
-
+
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Fri Jan 9 06:34:33 2015
@@ -37,8 +37,8 @@ import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.WebPageWritable;
import org.apache.gora.mapreduce.GoraMapper;
-public class DbUpdateMapper
-extends GoraMapper<String, WebPage, UrlWithScore, NutchWritable> {
+public class DbUpdateMapper extends
+ GoraMapper<String, WebPage, UrlWithScore, NutchWritable> {
public static final Logger LOG = DbUpdaterJob.LOG;
private ScoringFilters scoringFilters;
@@ -46,42 +46,45 @@ extends GoraMapper<String, WebPage, UrlW
private final List<ScoreDatum> scoreData = new ArrayList<ScoreDatum>();
private Utf8 batchId;
-
- //reuse writables
+
+ // reuse writables
private UrlWithScore urlWithScore = new UrlWithScore();
private NutchWritable nutchWritable = new NutchWritable();
private WebPageWritable pageWritable;
@Override
public void map(String key, WebPage page, Context context)
- throws IOException, InterruptedException {
- if(Mark.GENERATE_MARK.checkMark(page) == null) {
+ throws IOException, InterruptedException {
+ if (Mark.GENERATE_MARK.checkMark(page) == null) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Skipping " + TableUtil.unreverseUrl(key) + "; not generated yet");
+ LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
+ + "; not generated yet");
}
return;
}
-
+
String url = TableUtil.unreverseUrl(key);
scoreData.clear();
Map<CharSequence, CharSequence> outlinks = page.getOutlinks();
if (outlinks != null) {
for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
- int depth=Integer.MAX_VALUE;
+ int depth = Integer.MAX_VALUE;
CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
- if (depthUtf8 != null) depth=Integer.parseInt(depthUtf8.toString());
- scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(),
- e.getValue().toString(), depth));
+ if (depthUtf8 != null)
+ depth = Integer.parseInt(depthUtf8.toString());
+ scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue()
+ .toString(), depth));
}
}
// TODO: Outlink filtering (i.e. "only keep the first n outlinks")
try {
- scoringFilters.distributeScoreToOutlinks(url, page, scoreData, (outlinks == null ? 0 : outlinks.size()));
+ scoringFilters.distributeScoreToOutlinks(url, page, scoreData,
+ (outlinks == null ? 0 : outlinks.size()));
} catch (ScoringFilterException e) {
- LOG.warn("Distributing score failed for URL: " + key +
- " exception:" + StringUtils.stringifyException(e));
+ LOG.warn("Distributing score failed for URL: " + key + " exception:"
+ + StringUtils.stringifyException(e));
}
urlWithScore.setUrl(key);
@@ -104,7 +107,8 @@ extends GoraMapper<String, WebPage, UrlW
public void setup(Context context) {
scoringFilters = new ScoringFilters(context.getConfiguration());
pageWritable = new WebPageWritable(context.getConfiguration(), null);
- batchId = new Utf8(context.getConfiguration().get(Nutch.BATCH_NAME_KEY,Nutch.ALL_BATCH_ID_STR));
+ batchId = new Utf8(context.getConfiguration().get(Nutch.BATCH_NAME_KEY,
+ Nutch.ALL_BATCH_ID_STR));
}
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Fri Jan 9 06:34:33 2015
@@ -37,11 +37,11 @@ import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.WebPageWritable;
import org.slf4j.Logger;
-public class DbUpdateReducer
-extends GoraReducer<UrlWithScore, NutchWritable, String, WebPage> {
+public class DbUpdateReducer extends
+ GoraReducer<UrlWithScore, NutchWritable, String, WebPage> {
+
+ public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
- public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
-
public static final Logger LOG = DbUpdaterJob.LOG;
private int retryMax;
@@ -53,11 +53,12 @@ extends GoraReducer<UrlWithScore, NutchW
private int maxLinks;
@Override
- protected void setup(Context context) throws IOException, InterruptedException {
+ protected void setup(Context context) throws IOException,
+ InterruptedException {
Configuration conf = context.getConfiguration();
retryMax = conf.getInt("db.fetch.retry.max", 3);
additionsAllowed = conf.getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
- maxInterval = conf.getInt("db.fetch.interval.max", 0 );
+ maxInterval = conf.getInt("db.fetch.interval.max", 0);
schedule = FetchScheduleFactory.getFetchSchedule(conf);
scoringFilters = new ScoringFilters(conf);
maxLinks = conf.getInt("db.update.max.inlinks", 10000);
@@ -70,7 +71,7 @@ extends GoraReducer<UrlWithScore, NutchW
WebPage page = null;
inlinkedScoreData.clear();
-
+
for (NutchWritable nutchWritable : values) {
Writable val = nutchWritable.get();
if (val instanceof WebPageWritable) {
@@ -108,10 +109,10 @@ extends GoraReducer<UrlWithScore, NutchW
} else {
byte status = page.getStatus().byteValue();
switch (status) {
- case CrawlStatus.STATUS_FETCHED: // succesful fetch
- case CrawlStatus.STATUS_REDIR_TEMP: // successful fetch, redirected
+ case CrawlStatus.STATUS_FETCHED: // succesful fetch
+ case CrawlStatus.STATUS_REDIR_TEMP: // successful fetch, redirected
case CrawlStatus.STATUS_REDIR_PERM:
- case CrawlStatus.STATUS_NOTMODIFIED: // successful fetch, notmodified
+ case CrawlStatus.STATUS_NOTMODIFIED: // successful fetch, notmodified
int modified = FetchSchedule.STATUS_UNKNOWN;
if (status == CrawlStatus.STATUS_NOTMODIFIED) {
modified = FetchSchedule.STATUS_NOTMODIFIED;
@@ -129,8 +130,9 @@ extends GoraReducer<UrlWithScore, NutchW
long prevFetchTime = page.getPrevFetchTime();
long modifiedTime = page.getModifiedTime();
long prevModifiedTime = page.getPrevModifiedTime();
- CharSequence lastModified = page.getHeaders().get(new Utf8("Last-Modified"));
- if ( lastModified != null ){
+ CharSequence lastModified = page.getHeaders().get(
+ new Utf8("Last-Modified"));
+ if (lastModified != null) {
try {
modifiedTime = HttpDateFormat.toLong(lastModified.toString());
prevModifiedTime = page.getModifiedTime();
@@ -143,15 +145,17 @@ extends GoraReducer<UrlWithScore, NutchW
schedule.forceRefetch(url, page, false);
break;
case CrawlStatus.STATUS_RETRY:
- schedule.setPageRetrySchedule(url, page, 0L, page.getPrevModifiedTime(), page.getFetchTime());
+ schedule.setPageRetrySchedule(url, page, 0L,
+ page.getPrevModifiedTime(), page.getFetchTime());
if (page.getRetriesSinceFetch() < retryMax) {
- page.setStatus((int)CrawlStatus.STATUS_UNFETCHED);
+ page.setStatus((int) CrawlStatus.STATUS_UNFETCHED);
} else {
- page.setStatus((int)CrawlStatus.STATUS_GONE);
+ page.setStatus((int) CrawlStatus.STATUS_GONE);
}
break;
case CrawlStatus.STATUS_GONE:
- schedule.setPageGoneSchedule(url, page, 0L, page.getPrevModifiedTime(), page.getFetchTime());
+ schedule.setPageGoneSchedule(url, page, 0L, page.getPrevModifiedTime(),
+ page.getFetchTime());
break;
}
}
@@ -159,35 +163,39 @@ extends GoraReducer<UrlWithScore, NutchW
if (page.getInlinks() != null) {
page.getInlinks().clear();
}
-
+
// Distance calculation.
// Retrieve smallest distance from all inlinks distances
// Calculate new distance for current page: smallest inlink distance plus 1.
- // If the new distance is smaller than old one (or if old did not exist yet),
+ // If the new distance is smaller than old one (or if old did not exist
+ // yet),
// write it to the page.
- int smallestDist=Integer.MAX_VALUE;
+ int smallestDist = Integer.MAX_VALUE;
for (ScoreDatum inlink : inlinkedScoreData) {
int inlinkDist = inlink.getDistance();
if (inlinkDist < smallestDist) {
- smallestDist=inlinkDist;
+ smallestDist = inlinkDist;
}
- page.getInlinks().put(new Utf8(inlink.getUrl()), new Utf8(inlink.getAnchor()));
+ page.getInlinks().put(new Utf8(inlink.getUrl()),
+ new Utf8(inlink.getAnchor()));
}
if (smallestDist != Integer.MAX_VALUE) {
- int oldDistance=Integer.MAX_VALUE;
+ int oldDistance = Integer.MAX_VALUE;
CharSequence oldDistUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
- if (oldDistUtf8 != null)oldDistance=Integer.parseInt(oldDistUtf8.toString());
- int newDistance = smallestDist+1;
+ if (oldDistUtf8 != null)
+ oldDistance = Integer.parseInt(oldDistUtf8.toString());
+ int newDistance = smallestDist + 1;
if (newDistance < oldDistance) {
- page.getMarkers().put(DbUpdaterJob.DISTANCE, new Utf8(Integer.toString(newDistance)));
+ page.getMarkers().put(DbUpdaterJob.DISTANCE,
+ new Utf8(Integer.toString(newDistance)));
}
}
try {
scoringFilters.updateScore(url, page, inlinkedScoreData);
} catch (ScoringFilterException e) {
- LOG.warn("Scoring filters failed with exception " +
- StringUtils.stringifyException(e));
+ LOG.warn("Scoring filters failed with exception "
+ + StringUtils.stringifyException(e));
}
// clear markers
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Fri Jan 9 06:34:33 2015
@@ -48,9 +48,7 @@ public class DbUpdaterJob extends NutchT
public static final Logger LOG = LoggerFactory.getLogger(DbUpdaterJob.class);
-
- private static final Collection<WebPage.Field> FIELDS =
- new HashSet<WebPage.Field>();
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
static {
FIELDS.add(WebPage.Field.OUTLINKS);
@@ -78,35 +76,35 @@ public class DbUpdaterJob extends NutchT
public DbUpdaterJob(Configuration conf) {
setConf(conf);
}
-
- public Map<String,Object> run(Map<String,Object> args) throws Exception {
- String crawlId = (String)args.get(Nutch.ARG_CRAWL);
- String batchId = (String)args.get(Nutch.ARG_BATCH);
+
+ public Map<String, Object> run(Map<String, Object> args) throws Exception {
+ String crawlId = (String) args.get(Nutch.ARG_CRAWL);
+ String batchId = (String) args.get(Nutch.ARG_BATCH);
numJobs = 1;
currentJobNum = 0;
-
+
if (batchId == null) {
batchId = Nutch.ALL_BATCH_ID_STR;
}
getConf().set(Nutch.BATCH_NAME_KEY, batchId);
- //job.setBoolean(ALL, updateAll);
+ // job.setBoolean(ALL, updateAll);
ScoringFilters scoringFilters = new ScoringFilters(getConf());
HashSet<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
fields.addAll(scoringFilters.getFields());
-
+
currentJob = new NutchJob(getConf(), "update-table");
if (crawlId != null) {
currentJob.getConfiguration().set(Nutch.CRAWL_ID_KEY, crawlId);
}
-
+
// Partition by {url}, sort by {url,score} and group by {url}.
// This ensures that the inlinks are sorted by score when they enter
// the reducer.
-
+
currentJob.setPartitionerClass(UrlOnlyPartitioner.class);
currentJob.setSortComparatorClass(UrlScoreComparator.class);
currentJob.setGroupingComparatorClass(UrlOnlyComparator.class);
-
+
MapFieldValueFilter<String, WebPage> batchIdFilter =
getBatchIdFilter(batchId);
StorageUtils.initMapperJob(currentJob, fields, UrlWithScore.class,
NutchWritable.class, DbUpdateMapper.class, batchIdFilter);
@@ -129,22 +127,22 @@ public class DbUpdaterJob extends NutchT
return filter;
}
- private int updateTable(String crawlId,String batchId) throws Exception {
-
+ private int updateTable(String crawlId, String batchId) throws Exception {
+
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("DbUpdaterJob: starting at " + sdf.format(start));
-
+
if (batchId.equals(Nutch.ALL_BATCH_ID_STR)) {
LOG.info("DbUpdaterJob: updatinging all");
} else {
LOG.info("DbUpdaterJob: batchId: " + batchId);
}
- run(ToolUtil.toArgMap(Nutch.ARG_CRAWL, crawlId,
- Nutch.ARG_BATCH, batchId));
-
+ run(ToolUtil.toArgMap(Nutch.ARG_CRAWL, crawlId, Nutch.ARG_BATCH, batchId));
+
long finish = System.currentTimeMillis();
- LOG.info("DbUpdaterJob: finished at " + sdf.format(finish) + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
+ LOG.info("DbUpdaterJob: finished at " + sdf.format(finish)
+ + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
return 0;
}
@@ -152,9 +150,9 @@ public class DbUpdaterJob extends NutchT
String crawlId = null;
String batchId;
- String usage = "Usage: DbUpdaterJob (<batchId> | -all) [-crawlId <id>] " +
- " <batchId> - crawl identifier returned by Generator, or -all for all \n \t \t generated batchId-s\n" +
- " -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\n";
+ String usage = "Usage: DbUpdaterJob (<batchId> | -all) [-crawlId <id>] "
+ + " <batchId> - crawl identifier returned by Generator, or -all for all \n \t \t generated batchId-s\n"
+ + " -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\n";
if (args.length == 0) {
System.err.println(usage);
@@ -171,14 +169,15 @@ public class DbUpdaterJob extends NutchT
if ("-crawlId".equals(args[i])) {
getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
} else {
- throw new IllegalArgumentException("arg " +args[i]+ " not recognized");
+ throw new IllegalArgumentException("arg " + args[i] + " not recognized");
}
}
- return updateTable(crawlId,batchId);
+ return updateTable(crawlId, batchId);
}
public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new DbUpdaterJob(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new DbUpdaterJob(),
+ args);
System.exit(res);
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
Fri Jan 9 06:34:33 2015
@@ -20,19 +20,18 @@ package org.apache.nutch.crawl;
import org.apache.nutch.storage.WebPage;
/**
- * This class implements the default re-fetch schedule. That is, no matter
- * if the page was changed or not, the <code>fetchInterval</code> remains
+ * This class implements the default re-fetch schedule. That is, no matter if
+ * the page was changed or not, the <code>fetchInterval</code> remains
* unchanged, and the updated page fetchTime will always be set to
* <code>fetchTime + fetchInterval * 1000</code>.
- *
+ *
* @author Andrzej Bialecki
*/
public class DefaultFetchSchedule extends AbstractFetchSchedule {
@Override
- public void setFetchSchedule(String url, WebPage page,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state) {
+ public void setFetchSchedule(String url, WebPage page, long prevFetchTime,
+ long prevModifiedTime, long fetchTime, long modifiedTime, int state) {
super.setFetchSchedule(url, page, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
page.setFetchTime(fetchTime + page.getFetchInterval() * 1000L);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchSchedule.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchSchedule.java Fri
Jan 9 06:34:33 2015
@@ -24,115 +24,142 @@ import org.apache.hadoop.io.Text;
import org.apache.nutch.storage.WebPage;
/**
- * This interface defines the contract for implementations that manipulate
- * fetch times and re-fetch intervals.
- *
+ * This interface defines the contract for implementations that manipulate fetch
+ * times and re-fetch intervals.
+ *
* @author Andrzej Bialecki
*/
public interface FetchSchedule extends Configurable {
/** It is unknown whether page was changed since our last visit. */
- public static final int STATUS_UNKNOWN = 0;
+ public static final int STATUS_UNKNOWN = 0;
/** Page is known to have been modified since our last visit. */
- public static final int STATUS_MODIFIED = 1;
+ public static final int STATUS_MODIFIED = 1;
/** Page is known to remain unmodified since our last visit. */
- public static final int STATUS_NOTMODIFIED = 2;
+ public static final int STATUS_NOTMODIFIED = 2;
public static final int SECONDS_PER_DAY = 3600 * 24;
/**
- * Initialize fetch schedule related data. Implementations should at least
- * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
- * implementation set the <code>fetchTime</code> to now, using the
- * default <code>fetchInterval</code>.
- *
- * @param url URL of the page.
+ * Initialize fetch schedule related data. Implementations should at least
set
+ * the <code>fetchTime</code> and <code>fetchInterval</code>. The default
+ * implementation set the <code>fetchTime</code> to now, using the default
+ * <code>fetchInterval</code>.
+ *
+ * @param url
+ * URL of the page.
* @param page
*/
public void initializeSchedule(String url, WebPage page);
/**
* Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
- * successfully fetched page.
- * Implementations may use supplied arguments to support different
re-fetching
- * schedules.
- *
- * @param url url of the page
+ * successfully fetched page. Implementations may use supplied arguments to
+ * support different re-fetching schedules.
+ *
+ * @param url
+ * url of the page
* @param page
- * @param prevFetchTime previous value of fetch time, or -1 if not available
- * @param prevModifiedTime previous value of modifiedTime, or -1 if not
available
- * @param fetchTime the latest time, when the page was recently re-fetched.
Most FetchSchedule
- * implementations should update the value in {@param datum} to something
greater than this value.
- * @param modifiedTime last time the content was modified. This information
comes from
- * the protocol implementations, or is set to < 0 if not available. Most
FetchSchedule
- * implementations should update the value in {@param datum} to this value.
- * @param state if {@link #STATUS_MODIFIED}, then the content is considered
to be "changed" before the
- * <code>fetchTime</code>, if {@link #STATUS_NOTMODIFIED} then the content
is known to be unchanged.
- * This information may be obtained by comparing page signatures before and
after fetching. If this
- * is set to {@link #STATUS_UNKNOWN}, then it is unknown whether the page
was changed; implementations
- * are free to follow a sensible default behavior.
- */
- public void setFetchSchedule(String url, WebPage page,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state);
-
- /**
- * This method specifies how to schedule refetching of pages
- * marked as GONE. Default implementation increases fetchInterval by 50%,
- * and if it exceeds the <code>maxInterval</code> it calls
+ * @param prevFetchTime
+ * previous value of fetch time, or -1 if not available
+ * @param prevModifiedTime
+ * previous value of modifiedTime, or -1 if not available
+ * @param fetchTime
+ * the latest time, when the page was recently re-fetched. Most
+ * FetchSchedule implementations should update the value in
+ * @param datum
+ * to something greater than this value.
+ * @param modifiedTime
+ * last time the content was modified. This information comes from
+ * the protocol implementations, or is set to < 0 if not available.
+ * Most FetchSchedule implementations should update the value in
+ * @param datum
+ * to this value.
+ * @param state
+ * if {@link #STATUS_MODIFIED}, then the content is considered to be
+ * "changed" before the <code>fetchTime</code>, if
+ * {@link #STATUS_NOTMODIFIED} then the content is known to be
+ * unchanged. This information may be obtained by comparing page
+ * signatures before and after fetching. If this is set to
+ * {@link #STATUS_UNKNOWN}, then it is unknown whether the page was
+ * changed; implementations are free to follow a sensible default
+ * behavior.
+ */
+ public void setFetchSchedule(String url, WebPage page, long prevFetchTime,
+ long prevModifiedTime, long fetchTime, long modifiedTime, int state);
+
+ /**
+ * This method specifies how to schedule refetching of pages marked as GONE.
+ * Default implementation increases fetchInterval by 50%, and if it exceeds
+ * the <code>maxInterval</code> it calls
* {@link #forceRefetch(Text, CrawlDatum, boolean)}.
- * @param url URL of the page
+ *
+ * @param url
+ * URL of the page
* @param page
*/
- public void setPageGoneSchedule(String url, WebPage page,
- long prevFetchTime, long prevModifiedTime, long fetchTime);
+ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
+ long prevModifiedTime, long fetchTime);
/**
- * This method adjusts the fetch schedule if fetching needs to be
- * re-tried due to transient errors. The default implementation
- * sets the next fetch time 1 day in the future and increases the
- * retry counter.Set
- * @param url URL of the page
+ * This method adjusts the fetch schedule if fetching needs to be re-tried
due
+ * to transient errors. The default implementation sets the next fetch time 1
+ * day in the future and increases the retry counter.Set
+ *
+ * @param url
+ * URL of the page
* @param page
- * @param prevFetchTime previous fetch time
- * @param prevModifiedTime previous modified time
- * @param fetchTime current fetch time
+ * @param prevFetchTime
+ * previous fetch time
+ * @param prevModifiedTime
+ * previous modified time
+ * @param fetchTime
+ * current fetch time
*/
public void setPageRetrySchedule(String url, WebPage page,
long prevFetchTime, long prevModifiedTime, long fetchTime);
/**
* Calculates last fetch time of the given CrawlDatum.
+ *
* @return the date as a long.
*/
public long calculateLastFetchTime(WebPage page);
/**
- * This method provides information whether the page is suitable for
- * selection in the current fetchlist. NOTE: a true return value does not
- * guarantee that the page will be fetched, it just allows it to be
- * included in the further selection process based on scores. The default
- * implementation checks <code>fetchTime</code>, if it is higher than the
- * {@param curTime} it returns false, and true otherwise. It will also
- * check that fetchTime is not too remote (more than
<code>maxInterval</code),
- * in which case it lowers the interval and returns true.
- * @param url URL of the page
- * @param row url's row
- * @param curTime reference time (usually set to the time when the
- * fetchlist generation process was started).
+ * This method provides information whether the page is suitable for
selection
+ * in the current fetchlist. NOTE: a true return value does not guarantee
that
+ * the page will be fetched, it just allows it to be included in the further
+ * selection process based on scores. The default implementation checks
+ * <code>fetchTime</code>, if it is higher than the
+ *
+ * @param curTime
+ * it returns false, and true otherwise. It will also check that
+ * fetchTime is not too remote (more than <code>maxInterval</code),
+ * in which case it lowers the interval and returns true.
+ * @param url
+ * URL of the page
+ * @param row
+ * url's row
+ * @param curTime
+ * reference time (usually set to the time when the fetchlist
+ * generation process was started).
* @return true, if the page should be considered for inclusion in the current
- * fetchlist, otherwise false.
+ * fetchlist, otherwise false.
*/
public boolean shouldFetch(String url, WebPage page, long curTime);
/**
- * This method resets fetchTime, fetchInterval, modifiedTime and
- * page signature, so that it forces refetching.
- * @param url URL of the page
+ * This method resets fetchTime, fetchInterval, modifiedTime and page
+ * signature, so that it forces refetching.
+ *
+ * @param url
+ * URL of the page
* @param page
- * @param asap if true, force refetch as soon as possible - this sets
- * the fetchTime to now. If false, force refetch whenever the next fetch
- * time is set.
+ * @param asap
+ * if true, force refetch as soon as possible - this sets the
+ * fetchTime to now. If false, force refetch whenever the next fetch
+ * time is set.
*/
public void forceRefetch(String url, WebPage row, boolean asap);
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
---
nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
(original)
+++
nutch/branches/2.x/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
Fri Jan 9 06:34:33 2015
@@ -25,20 +25,23 @@ import org.apache.nutch.util.ObjectCache
/** Creates and caches a {@link FetchSchedule} implementation. */
public class FetchScheduleFactory {
- public static final Logger LOG = LoggerFactory.getLogger(FetchScheduleFactory.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(FetchScheduleFactory.class);
- private FetchScheduleFactory() {} // no public ctor
+ private FetchScheduleFactory() {
+ } // no public ctor
/** Return the FetchSchedule implementation. */
public static FetchSchedule getFetchSchedule(Configuration conf) {
- String clazz = conf.get("db.fetch.schedule.class", DefaultFetchSchedule.class.getName());
+ String clazz = conf.get("db.fetch.schedule.class",
+ DefaultFetchSchedule.class.getName());
ObjectCache objectCache = ObjectCache.get(conf);
- FetchSchedule impl = (FetchSchedule)objectCache.getObject(clazz);
+ FetchSchedule impl = (FetchSchedule) objectCache.getObject(clazz);
if (impl == null) {
try {
LOG.info("Using FetchSchedule impl: " + clazz);
Class<?> implClass = Class.forName(clazz);
- impl = (FetchSchedule)implClass.newInstance();
+ impl = (FetchSchedule) implClass.newInstance();
impl.setConf(conf);
objectCache.setObject(clazz, impl);
} catch (Exception e) {