Author: lewismc
Date: Thu Jan 29 05:38:59 2015
New Revision: 1655526
URL: http://svn.apache.org/r1655526
Log:
NUTCH-865 Format source code in unique style
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java
nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java
nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java
nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java
nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java
nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java
nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java
nutch/trunk/src/java/org/apache/nutch/indexer/CleaningJob.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexWriter.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexWriters.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingJob.java
nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java
nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java
nutch/trunk/src/java/org/apache/nutch/indexer/NutchIndexAction.java
nutch/trunk/src/java/org/apache/nutch/metadata/CreativeCommons.java
nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java
nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java
nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java
nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java
nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java
nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java
nutch/trunk/src/java/org/apache/nutch/net/package-info.java
nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java
nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java
nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java
nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginList.java
nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseResult.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserNotFound.java
nutch/trunk/src/java/org/apache/nutch/parse/package-info.java
nutch/trunk/src/java/org/apache/nutch/plugin/CircularDependencyException.java
nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java
nutch/trunk/src/java/org/apache/nutch/plugin/ExtensionPoint.java
nutch/trunk/src/java/org/apache/nutch/plugin/MissingDependencyException.java
nutch/trunk/src/java/org/apache/nutch/plugin/Pluggable.java
nutch/trunk/src/java/org/apache/nutch/plugin/Plugin.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginRuntimeException.java
nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolFactory.java
nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolNotFound.java
nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java
nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRules.java
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/trunk/src/java/org/apache/nutch/protocol/package-info.java
nutch/trunk/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilter.java
nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java
nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/package-info.java
nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilters.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentPart.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
nutch/trunk/src/java/org/apache/nutch/segment/package-info.java
nutch/trunk/src/java/org/apache/nutch/tools/Benchmark.java
nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java
nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java
nutch/trunk/src/java/org/apache/nutch/tools/package-info.java
nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java
nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java
nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java
nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java
nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java
nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java
nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java
nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java
nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java
nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java
nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java
nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java
nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java
nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java
nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java
nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
nutch/trunk/src/java/org/apache/nutch/util/package-info.java
nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java
nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java
nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java
nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package-info.java
nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/package-info.java
nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java
nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/package-info.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/package-info.java
nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
nutch/trunk/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/trunk/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java
nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
nutch/trunk/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
nutch/trunk/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/package-info.java
nutch/trunk/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/package-info.java
nutch/trunk/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/package-info.java
nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/package-info.java
nutch/trunk/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
nutch/trunk/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
nutch/trunk/src/test/org/apache/nutch/crawl/DummyWritable.java
nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestSignatureFactory.java
nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
nutch/trunk/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java
nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
nutch/trunk/src/test/org/apache/nutch/net/TestURLFilters.java
nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
nutch/trunk/src/test/org/apache/nutch/parse/TestParseText.java
nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
nutch/trunk/src/test/org/apache/nutch/plugin/HelloWorldExtension.java
nutch/trunk/src/test/org/apache/nutch/plugin/ITestExtension.java
nutch/trunk/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java
nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java
nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
nutch/trunk/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/DelayHandler.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/FakeHandler.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/package-info.java
nutch/trunk/src/test/org/apache/nutch/util/TestGZIPUtils.java
nutch/trunk/src/test/org/apache/nutch/util/TestMimeUtil.java
nutch/trunk/src/test/org/apache/nutch/util/TestNodeWalker.java
nutch/trunk/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java
nutch/trunk/src/test/org/apache/nutch/util/TestStringUtil.java
nutch/trunk/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java
nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java
nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jan 29 05:38:59 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.10-SNAPSHOT
+* NUTCH-865 Format source code in unique style (lewismc)
+
* NUTCH-1893 Parse-tika failes to parse feed files (Mengying Wang via snagel)
* NUTCH-1920 Upgrade Nutch to use Java 1.7 (lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Thu
Jan 29 05:38:59 2015
@@ -30,38 +30,43 @@ import org.apache.nutch.crawl.CrawlDatum
*
* @author Andrzej Bialecki
*/
-public abstract class AbstractFetchSchedule extends Configured implements
FetchSchedule {
- private static final Logger LOG =
LoggerFactory.getLogger(AbstractFetchSchedule.class);
-
+public abstract class AbstractFetchSchedule extends Configured implements
+ FetchSchedule {
+ private static final Logger LOG = LoggerFactory
+ .getLogger(AbstractFetchSchedule.class);
+
protected int defaultInterval;
protected int maxInterval;
-
+
public AbstractFetchSchedule() {
super(null);
}
-
+
public AbstractFetchSchedule(Configuration conf) {
super(conf);
}
-
+
public void setConf(Configuration conf) {
super.setConf(conf);
- if (conf == null) return;
+ if (conf == null)
+ return;
defaultInterval = conf.getInt("db.fetch.interval.default", 0);
- maxInterval = conf.getInt("db.fetch.interval.max", 0 );
+ maxInterval = conf.getInt("db.fetch.interval.max", 0);
LOG.info("defaultInterval=" + defaultInterval);
LOG.info("maxInterval=" + maxInterval);
}
-
+
/**
- * Initialize fetch schedule related data. Implementations should at least
- * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
- * implementation sets the <code>fetchTime</code> to now, using the
- * default <code>fetchInterval</code>.
- *
- * @param url URL of the page.
- *
- * @param datum datum instance to be initialized (modified in place).
+ * Initialize fetch schedule related data. Implementations should at least
set
+ * the <code>fetchTime</code> and <code>fetchInterval</code>. The default
+ * implementation sets the <code>fetchTime</code> to now, using the default
+ * <code>fetchInterval</code>.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * datum instance to be initialized (modified in place).
*/
public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
datum.setFetchTime(System.currentTimeMillis());
@@ -69,101 +74,111 @@ public abstract class AbstractFetchSched
datum.setRetriesSinceFetch(0);
return datum;
}
-
+
/**
* Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
- * successfully fetched page. NOTE: this implementation resets the
- * retry counter - extending classes should call super.setFetchSchedule() to
+ * successfully fetched page. NOTE: this implementation resets the retry
+ * counter - extending classes should call super.setFetchSchedule() to
* preserve this behavior.
*/
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state) {
+ long prevFetchTime, long prevModifiedTime, long fetchTime,
+ long modifiedTime, int state) {
datum.setRetriesSinceFetch(0);
return datum;
}
-
+
/**
- * This method specifies how to schedule refetching of pages
- * marked as GONE. Default implementation increases fetchInterval by 50%
- * but the value may never exceed <code>maxInterval</code>.
- *
- * @param url URL of the page.
- *
- * @param datum datum instance to be adjusted.
- *
+ * This method specifies how to schedule refetching of pages marked as GONE.
+ * Default implementation increases fetchInterval by 50% but the value may
+ * never exceed <code>maxInterval</code>.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * datum instance to be adjusted.
+ *
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than @see CrawlDatum, but
- * implementations should make sure that it contains at least all
- * information from @see CrawlDatum.
+ * NOTE: this may be a different instance than @see CrawlDatum, but
+ * implementations should make sure that it contains at least all
+ * information from @see CrawlDatum.
*/
public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime, long fetchTime) {
+ long prevFetchTime, long prevModifiedTime, long fetchTime) {
// no page is truly GONE ... just increase the interval by 50%
// and try much later.
if ((datum.getFetchInterval() * 1.5f) < maxInterval)
datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
else
datum.setFetchInterval(maxInterval * 0.9f);
- datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
+ datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000);
return datum;
}
-
+
/**
- * This method adjusts the fetch schedule if fetching needs to be
- * re-tried due to transient errors. The default implementation
- * sets the next fetch time 1 day in the future and increases
- * the retry counter.
- *
- * @param url URL of the page.
- *
- * @param datum page information.
- *
- * @param prevFetchTime previous fetch time.
- *
- * @param prevModifiedTime previous modified time.
- *
- * @param fetchTime current fetch time.
- *
+ * This method adjusts the fetch schedule if fetching needs to be re-tried
due
+ * to transient errors. The default implementation sets the next fetch time 1
+ * day in the future and increases the retry counter.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * page information.
+ *
+ * @param prevFetchTime
+ * previous fetch time.
+ *
+ * @param prevModifiedTime
+ * previous modified time.
+ *
+ * @param fetchTime
+ * current fetch time.
+ *
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than @see CrawlDatum, but
- * implementations should make sure that it contains at least all
- * information from @see CrawlDatum.
+ * NOTE: this may be a different instance than @see CrawlDatum, but
+ * implementations should make sure that it contains at least all
+ * information from @see CrawlDatum.
*/
public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime, long fetchTime) {
- datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY*1000);
+ long prevFetchTime, long prevModifiedTime, long fetchTime) {
+ datum.setFetchTime(fetchTime + (long) SECONDS_PER_DAY * 1000);
datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
return datum;
}
-
+
/**
* This method return the last fetch time of the CrawlDatum
+ *
* @return the date as a long.
*/
public long calculateLastFetchTime(CrawlDatum datum) {
- return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000;
+ return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
}
/**
- * This method provides information whether the page is suitable for
- * selection in the current fetchlist. NOTE: a true return value does not
- * guarantee that the page will be fetched, it just allows it to be
- * included in the further selection process based on scores. The default
- * implementation checks <code>fetchTime</code>, if it is higher than the
- * <code>curTime</code> it returns false, and true otherwise. It will also
- * check that fetchTime is not too remote (more than
<code>maxInterval</code>,
- * in which case it lowers the interval and returns true.
- *
- * @param url URL of the page.
- *
- * @param datum datum instance.
- *
- * @param curTime reference time (usually set to the time when the
- * fetchlist generation process was started).
- *
+ * This method provides information whether the page is suitable for
selection
+ * in the current fetchlist. NOTE: a true return value does not guarantee
that
+ * the page will be fetched, it just allows it to be included in the further
+ * selection process based on scores. The default implementation checks
+ * <code>fetchTime</code>, if it is higher than the <code>curTime</code> it
+ * returns false, and true otherwise. It will also check that fetchTime is
not
+ * too remote (more than <code>maxInterval</code>, in which case it lowers
the
+ * interval and returns true.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * datum instance.
+ *
+ * @param curTime
+ * reference time (usually set to the time when the fetchlist
+ * generation process was started).
+ *
* @return true, if the page should be considered for inclusion in the
current
- * fetchlist, otherwise false.
+ * fetchlist, otherwise false.
*/
public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
// pages are never truly GONE - we have to check them from time to time.
@@ -176,24 +191,27 @@ public abstract class AbstractFetchSched
datum.setFetchTime(curTime);
}
if (datum.getFetchTime() > curTime) {
- return false; // not time yet
+ return false; // not time yet
}
return true;
}
-
+
/**
* This method resets fetchTime, fetchInterval, modifiedTime,
* retriesSinceFetch and page signature, so that it forces refetching.
- *
- * @param url URL of the page.
- *
- * @param datum datum instance.
- *
- * @param asap if true, force refetch as soon as possible - this sets
- * the fetchTime to now. If false, force refetch whenever the next fetch
- * time is set.
+ *
+ * @param url
+ * URL of the page.
+ *
+ * @param datum
+ * datum instance.
+ *
+ * @param asap
+ * if true, force refetch as soon as possible - this sets the
+ * fetchTime to now. If false, force refetch whenever the next fetch
+ * time is set.
*/
- public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
+ public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
// reduce fetchInterval so that it fits within the max value
if (datum.getFetchInterval() > maxInterval)
datum.setFetchInterval(maxInterval * 0.9f);
@@ -201,7 +219,8 @@ public abstract class AbstractFetchSched
datum.setRetriesSinceFetch(0);
datum.setSignature(null);
datum.setModifiedTime(0L);
- if (asap) datum.setFetchTime(System.currentTimeMillis());
+ if (asap)
+ datum.setFetchTime(System.currentTimeMillis());
return datum;
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Thu
Jan 29 05:38:59 2015
@@ -37,11 +37,12 @@ import org.slf4j.LoggerFactory;
* If SYNC_DELTA property is true, then:
* <ul>
* <li>calculate a <code>delta = fetchTime - modifiedTime</code></li>
- * <li>try to synchronize with the time of change, by shifting the next
fetchTime
- * by a fraction of the difference between the last modification time and the
last
- * fetch time. I.e. the next fetch time will be set to
+ * <li>try to synchronize with the time of change, by shifting the next
+ * fetchTime by a fraction of the difference between the last modification time
+ * and the last fetch time. I.e. the next fetch time will be set to
* <code>fetchTime + fetchInterval - delta * SYNC_DELTA_RATE</code></li>
- * <li>if the adjusted fetch interval is bigger than the delta, then
<code>fetchInterval = delta</code>.</li>
+ * <li>if the adjusted fetch interval is bigger than the delta, then
+ * <code>fetchInterval = delta</code>.</li>
* </ul>
* </li>
* <li>the minimum value of fetchInterval may not be smaller than MIN_INTERVAL
@@ -49,17 +50,21 @@ import org.slf4j.LoggerFactory;
* <li>the maximum value of fetchInterval may not be bigger than MAX_INTERVAL
* (default is 365 days).</li>
* </ul>
- * <p>NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may
destabilize the algorithm,
- * so that the fetch interval either increases or decreases infinitely, with
little
- * relevance to the page changes. Please use {@link #main(String[])} method to
- * test the values before applying them in a production system.</p>
+ * <p>
+ * NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize
+ * the algorithm, so that the fetch interval either increases or decreases
+ * infinitely, with little relevance to the page changes. Please use
+ * {@link #main(String[])} method to test the values before applying them in a
+ * production system.
+ * </p>
*
* @author Andrzej Bialecki
*/
public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
// Loggg
- public static final Logger LOG =
LoggerFactory.getLogger(AbstractFetchSchedule.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(AbstractFetchSchedule.class);
protected float INC_RATE;
@@ -68,26 +73,29 @@ public class AdaptiveFetchSchedule exten
private int MAX_INTERVAL;
private int MIN_INTERVAL;
-
+
private boolean SYNC_DELTA;
private double SYNC_DELTA_RATE;
-
+
public void setConf(Configuration conf) {
super.setConf(conf);
- if (conf == null) return;
+ if (conf == null)
+ return;
INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60);
- MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval",
SECONDS_PER_DAY * 365 ); // 1 year
+ MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval",
+ SECONDS_PER_DAY * 365); // 1 year
SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta",
true);
- SYNC_DELTA_RATE =
conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
+ SYNC_DELTA_RATE = conf.getFloat(
+ "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
}
@Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state) {
+ long prevFetchTime, long prevModifiedTime, long fetchTime,
+ long modifiedTime, int state) {
super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
@@ -99,24 +107,27 @@ public class AdaptiveFetchSchedule exten
if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
// Is fetch interval preset in CrawlDatum MD? Then use preset interval
- FloatWritable customIntervalWritable=
(FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
+ FloatWritable customIntervalWritable = (FloatWritable) (datum
+ .getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
interval = customIntervalWritable.get();
} else {
- if (modifiedTime <= 0) modifiedTime = fetchTime;
+ if (modifiedTime <= 0)
+ modifiedTime = fetchTime;
switch (state) {
- case FetchSchedule.STATUS_MODIFIED:
- interval *= (1.0f - DEC_RATE);
- break;
- case FetchSchedule.STATUS_NOTMODIFIED:
- interval *= (1.0f + INC_RATE);
- break;
- case FetchSchedule.STATUS_UNKNOWN:
- break;
+ case FetchSchedule.STATUS_MODIFIED:
+ interval *= (1.0f - DEC_RATE);
+ break;
+ case FetchSchedule.STATUS_NOTMODIFIED:
+ interval *= (1.0f + INC_RATE);
+ break;
+ case FetchSchedule.STATUS_UNKNOWN:
+ break;
}
if (SYNC_DELTA) {
// try to synchronize with the time of change
long delta = (fetchTime - modifiedTime) / 1000L;
- if (delta > interval) interval = delta;
+ if (delta > interval)
+ interval = delta;
refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
}
if (interval < MIN_INTERVAL) {
@@ -154,30 +165,39 @@ public class AdaptiveFetchSchedule exten
// let's move the timeline a couple of deltas
for (int i = 0; i < 10000; i++) {
if (lastModified + update < curTime) {
- //System.out.println("i=" + i + ", lastModified=" + lastModified + ",
update=" + update + ", curTime=" + curTime);
+ // System.out.println("i=" + i + ", lastModified=" + lastModified +
+ // ", update=" + update + ", curTime=" + curTime);
changed = true;
changeCnt++;
lastModified = curTime;
}
- LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() /
delta) + "\tinterval "
- + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t
missed " + miss);
+ LOG.info(i + ". " + changed + "\twill fetch at "
+ + (p.getFetchTime() / delta) + "\tinterval "
+ + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+ + miss);
if (p.getFetchTime() <= curTime) {
fetchCnt++;
- fs.setFetchSchedule(new Text("http://www.example.com"), p,
- p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
- changed ? FetchSchedule.STATUS_MODIFIED :
FetchSchedule.STATUS_NOTMODIFIED);
- LOG.info("\tfetched & adjusted: " + "\twill fetch at " +
(p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
- if (!changed) miss++;
- if (miss > maxMiss) maxMiss = miss;
+ fs.setFetchSchedule(new Text("http://www.example.com"), p, p
+ .getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+ changed ? FetchSchedule.STATUS_MODIFIED
+ : FetchSchedule.STATUS_NOTMODIFIED);
+ LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+ + (p.getFetchTime() / delta) + "\tinterval "
+ + (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
+ if (!changed)
+ miss++;
+ if (miss > maxMiss)
+ maxMiss = miss;
changed = false;
totalMiss += miss;
miss = 0;
}
- if (changed) miss++;
+ if (changed)
+ miss++;
curTime += delta;
}
LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
- LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + "
times.");
+ LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+ + " times.");
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Thu Jan 29
05:38:59 2015
@@ -41,52 +41,51 @@ public class CrawlDatum implements Writa
private static final byte OLD_STATUS_FETCH_SUCCESS = 5;
private static final byte OLD_STATUS_FETCH_RETRY = 6;
private static final byte OLD_STATUS_FETCH_GONE = 7;
-
+
private static HashMap<Byte, Byte> oldToNew = new HashMap<Byte, Byte>();
-
+
/** Page was not fetched yet. */
- public static final byte STATUS_DB_UNFETCHED = 0x01;
+ public static final byte STATUS_DB_UNFETCHED = 0x01;
/** Page was successfully fetched. */
- public static final byte STATUS_DB_FETCHED = 0x02;
+ public static final byte STATUS_DB_FETCHED = 0x02;
/** Page no longer exists. */
- public static final byte STATUS_DB_GONE = 0x03;
+ public static final byte STATUS_DB_GONE = 0x03;
/** Page temporarily redirects to other page. */
- public static final byte STATUS_DB_REDIR_TEMP = 0x04;
+ public static final byte STATUS_DB_REDIR_TEMP = 0x04;
/** Page permanently redirects to other page. */
- public static final byte STATUS_DB_REDIR_PERM = 0x05;
+ public static final byte STATUS_DB_REDIR_PERM = 0x05;
/** Page was successfully fetched and found not modified. */
- public static final byte STATUS_DB_NOTMODIFIED = 0x06;
- public static final byte STATUS_DB_DUPLICATE = 0x07;
-
+ public static final byte STATUS_DB_NOTMODIFIED = 0x06;
+ public static final byte STATUS_DB_DUPLICATE = 0x07;
+
/** Maximum value of DB-related status. */
- public static final byte STATUS_DB_MAX = 0x1f;
-
+ public static final byte STATUS_DB_MAX = 0x1f;
+
/** Fetching was successful. */
- public static final byte STATUS_FETCH_SUCCESS = 0x21;
+ public static final byte STATUS_FETCH_SUCCESS = 0x21;
/** Fetching unsuccessful, needs to be retried (transient errors). */
- public static final byte STATUS_FETCH_RETRY = 0x22;
+ public static final byte STATUS_FETCH_RETRY = 0x22;
/** Fetching temporarily redirected to other page. */
- public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
+ public static final byte STATUS_FETCH_REDIR_TEMP = 0x23;
/** Fetching permanently redirected to other page. */
- public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
+ public static final byte STATUS_FETCH_REDIR_PERM = 0x24;
/** Fetching unsuccessful - page is gone. */
- public static final byte STATUS_FETCH_GONE = 0x25;
+ public static final byte STATUS_FETCH_GONE = 0x25;
/** Fetching successful - page is not modified. */
public static final byte STATUS_FETCH_NOTMODIFIED = 0x26;
-
+
/** Maximum value of fetch-related status. */
- public static final byte STATUS_FETCH_MAX = 0x3f;
-
+ public static final byte STATUS_FETCH_MAX = 0x3f;
+
/** Page signature. */
- public static final byte STATUS_SIGNATURE = 0x41;
+ public static final byte STATUS_SIGNATURE = 0x41;
/** Page was newly injected. */
- public static final byte STATUS_INJECTED = 0x42;
+ public static final byte STATUS_INJECTED = 0x42;
/** Page discovered through a link. */
- public static final byte STATUS_LINKED = 0x43;
+ public static final byte STATUS_LINKED = 0x43;
/** Page got metadata from a parser */
- public static final byte STATUS_PARSE_META = 0x44;
-
-
+ public static final byte STATUS_PARSE_META = 0x44;
+
public static final HashMap<Byte, String> statNames = new HashMap<Byte,
String>();
static {
statNames.put(STATUS_DB_UNFETCHED, "db_unfetched");
@@ -106,7 +105,7 @@ public class CrawlDatum implements Writa
statNames.put(STATUS_FETCH_GONE, "fetch_gone");
statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
statNames.put(STATUS_PARSE_META, "parse_metadata");
-
+
oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
oldToNew.put(OLD_STATUS_DB_GONE, STATUS_DB_GONE);
@@ -125,22 +124,25 @@ public class CrawlDatum implements Writa
private byte[] signature = null;
private long modifiedTime;
private org.apache.hadoop.io.MapWritable metaData;
-
+
public static boolean hasDbStatus(CrawlDatum datum) {
- if (datum.status <= STATUS_DB_MAX) return true;
+ if (datum.status <= STATUS_DB_MAX)
+ return true;
return false;
}
public static boolean hasFetchStatus(CrawlDatum datum) {
- if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX)
return true;
+ if (datum.status > STATUS_DB_MAX && datum.status <= STATUS_FETCH_MAX)
+ return true;
return false;
}
- public CrawlDatum() { }
+ public CrawlDatum() {
+ }
public CrawlDatum(int status, int fetchInterval) {
this();
- this.status = (byte)status;
+ this.status = (byte) status;
this.fetchInterval = fetchInterval;
}
@@ -153,26 +155,36 @@ public class CrawlDatum implements Writa
// accessor methods
//
- public byte getStatus() { return status; }
-
+ public byte getStatus() {
+ return status;
+ }
+
public static String getStatusName(byte value) {
String res = statNames.get(value);
- if (res == null) res = "unknown";
+ if (res == null)
+ res = "unknown";
return res;
}
-
- public void setStatus(int status) { this.status = (byte)status; }
+
+ public void setStatus(int status) {
+ this.status = (byte) status;
+ }
/**
* Returns either the time of the last fetch, or the next fetch time,
* depending on whether Fetcher or CrawlDbReducer set the time.
*/
- public long getFetchTime() { return fetchTime; }
+ public long getFetchTime() {
+ return fetchTime;
+ }
+
/**
- * Sets either the time of the last fetch or the next fetch time,
- * depending on whether Fetcher or CrawlDbReducer set the time.
+ * Sets either the time of the last fetch or the next fetch time, depending
on
+ * whether Fetcher or CrawlDbReducer set the time.
*/
- public void setFetchTime(long fetchTime) { this.fetchTime = fetchTime; }
+ public void setFetchTime(long fetchTime) {
+ this.fetchTime = fetchTime;
+ }
public long getModifiedTime() {
return modifiedTime;
@@ -181,20 +193,34 @@ public class CrawlDatum implements Writa
public void setModifiedTime(long modifiedTime) {
this.modifiedTime = modifiedTime;
}
-
- public byte getRetriesSinceFetch() { return retries; }
- public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;}
- public int getFetchInterval() { return fetchInterval; }
+ public byte getRetriesSinceFetch() {
+ return retries;
+ }
+
+ public void setRetriesSinceFetch(int retries) {
+ this.retries = (byte) retries;
+ }
+
+ public int getFetchInterval() {
+ return fetchInterval;
+ }
+
public void setFetchInterval(int fetchInterval) {
this.fetchInterval = fetchInterval;
}
+
public void setFetchInterval(float fetchInterval) {
this.fetchInterval = Math.round(fetchInterval);
}
- public float getScore() { return score; }
- public void setScore(float score) { this.score = score; }
+ public float getScore() {
+ return score;
+ }
+
+ public void setScore(float score) {
+ this.score = score;
+ }
public byte[] getSignature() {
return signature;
@@ -202,33 +228,37 @@ public class CrawlDatum implements Writa
public void setSignature(byte[] signature) {
if (signature != null && signature.length > 256)
- throw new RuntimeException("Max signature length (256) exceeded: " +
signature.length);
+ throw new RuntimeException("Max signature length (256) exceeded: "
+ + signature.length);
this.signature = signature;
}
-
- public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
- this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
- }
-
- /** Add all metadata from other CrawlDatum to this CrawlDatum.
- *
- * @param other CrawlDatum
- */
- public void putAllMetaData(CrawlDatum other) {
- for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
- getMetaData().put(e.getKey(), e.getValue());
- }
- }
+
+ public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable) {
+ this.metaData = new org.apache.hadoop.io.MapWritable(mapWritable);
+ }
/**
- * returns a MapWritable if it was set or read in @see
readFields(DataInput),
- * returns empty map in case CrawlDatum was freshly created (lazily
instantiated).
+ * Add all metadata from other CrawlDatum to this CrawlDatum.
+ *
+ * @param other
+ * CrawlDatum
+ */
+ public void putAllMetaData(CrawlDatum other) {
+ for (Entry<Writable, Writable> e : other.getMetaData().entrySet()) {
+ getMetaData().put(e.getKey(), e.getValue());
+ }
+ }
+
+ /**
+ * returns a MapWritable if it was set or read in @see readFields(DataInput),
+ * returns empty map in case CrawlDatum was freshly created (lazily
+ * instantiated).
*/
public org.apache.hadoop.io.MapWritable getMetaData() {
- if (this.metaData == null) this.metaData = new
org.apache.hadoop.io.MapWritable();
+ if (this.metaData == null)
+ this.metaData = new org.apache.hadoop.io.MapWritable();
return this.metaData;
}
-
//
// writable methods
@@ -241,8 +271,8 @@ public class CrawlDatum implements Writa
}
public void readFields(DataInput in) throws IOException {
- byte version = in.readByte(); // read version
- if (version > CUR_VERSION) // check version
+ byte version = in.readByte(); // read version
+ if (version > CUR_VERSION) // check version
throw new VersionMismatchException(CUR_VERSION, version);
status = in.readByte();
@@ -250,7 +280,8 @@ public class CrawlDatum implements Writa
retries = in.readByte();
if (version > 5) {
fetchInterval = in.readInt();
- } else fetchInterval = Math.round(in.readFloat());
+ } else
+ fetchInterval = Math.round(in.readFloat());
score = in.readFloat();
if (version > 2) {
modifiedTime = in.readLong();
@@ -258,9 +289,10 @@ public class CrawlDatum implements Writa
if (cnt > 0) {
signature = new byte[cnt];
in.readFully(signature);
- } else signature = null;
+ } else
+ signature = null;
}
-
+
if (version > 3) {
boolean hasMetadata = false;
if (version < 7) {
@@ -280,7 +312,8 @@ public class CrawlDatum implements Writa
metaData.readFields(in);
}
}
- if (hasMetadata==false) metaData = null;
+ if (hasMetadata == false)
+ metaData = null;
}
// translate status codes
if (version < 5) {
@@ -288,7 +321,7 @@ public class CrawlDatum implements Writa
status = oldToNew.get(status);
else
status = STATUS_DB_UNFETCHED;
-
+
}
}
@@ -297,7 +330,7 @@ public class CrawlDatum implements Writa
private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;
public void write(DataOutput out) throws IOException {
- out.writeByte(CUR_VERSION); // store current version
+ out.writeByte(CUR_VERSION); // store current version
out.writeByte(status);
out.writeLong(fetchTime);
out.writeByte(retries);
@@ -328,17 +361,19 @@ public class CrawlDatum implements Writa
this.modifiedTime = that.modifiedTime;
this.signature = that.signature;
if (that.metaData != null) {
- this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); //
make a deep copy
+ this.metaData = new org.apache.hadoop.io.MapWritable(that.metaData); //
make
+ // a
+ //
deep
+ //
copy
} else {
this.metaData = null;
}
}
-
//
// compare methods
//
-
+
/** Sort by decreasing score. */
public int compareTo(CrawlDatum that) {
if (that.score != this.score)
@@ -356,47 +391,49 @@ public class CrawlDatum implements Writa
return SignatureComparator._compare(this, that);
}
- /** A Comparator optimized for CrawlDatum. */
+ /** A Comparator optimized for CrawlDatum. */
public static class Comparator extends WritableComparator {
- public Comparator() { super(CrawlDatum.class); }
+ public Comparator() {
+ super(CrawlDatum.class);
+ }
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- float score1 = readFloat(b1,s1+SCORE_OFFSET);
- float score2 = readFloat(b2,s2+SCORE_OFFSET);
+ float score1 = readFloat(b1, s1 + SCORE_OFFSET);
+ float score2 = readFloat(b2, s2 + SCORE_OFFSET);
if (score2 != score1) {
return (score2 - score1) > 0 ? 1 : -1;
}
- int status1 = b1[s1+1];
- int status2 = b2[s2+1];
+ int status1 = b1[s1 + 1];
+ int status2 = b2[s2 + 1];
if (status2 != status1)
return status1 - status2;
- long fetchTime1 = readLong(b1, s1+1+1);
- long fetchTime2 = readLong(b2, s2+1+1);
+ long fetchTime1 = readLong(b1, s1 + 1 + 1);
+ long fetchTime2 = readLong(b2, s2 + 1 + 1);
if (fetchTime2 != fetchTime1)
return (fetchTime2 - fetchTime1) > 0 ? 1 : -1;
- int retries1 = b1[s1+1+1+8];
- int retries2 = b2[s2+1+1+8];
+ int retries1 = b1[s1 + 1 + 1 + 8];
+ int retries2 = b2[s2 + 1 + 1 + 8];
if (retries2 != retries1)
return retries2 - retries1;
- int fetchInterval1 = readInt(b1, s1+1+1+8+1);
- int fetchInterval2 = readInt(b2, s2+1+1+8+1);
+ int fetchInterval1 = readInt(b1, s1 + 1 + 1 + 8 + 1);
+ int fetchInterval2 = readInt(b2, s2 + 1 + 1 + 8 + 1);
if (fetchInterval2 != fetchInterval1)
return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
if (modifiedTime2 != modifiedTime1)
return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
- int sigl1 = b1[s1+SIG_OFFSET];
- int sigl2 = b2[s2+SIG_OFFSET];
- return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2,
SIG_OFFSET, sigl2);
+ int sigl1 = b1[s1 + SIG_OFFSET];
+ int sigl2 = b2[s2 + SIG_OFFSET];
+ return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2,
+ SIG_OFFSET, sigl2);
}
}
- static { // register this comparator
+ static { // register this comparator
WritableComparator.define(CrawlDatum.class, new Comparator());
}
-
//
// basic methods
//
@@ -404,12 +441,13 @@ public class CrawlDatum implements Writa
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("Version: " + CUR_VERSION + "\n");
- buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus()) +
")\n");
+ buf.append("Status: " + getStatus() + " (" + getStatusName(getStatus())
+ + ")\n");
buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
- buf.append("Retry interval: " + getFetchInterval() + " seconds (" +
- (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
+ buf.append("Retry interval: " + getFetchInterval() + " seconds ("
+ + (getFetchInterval() / FetchSchedule.SECONDS_PER_DAY) + " days)\n");
buf.append("Score: " + getScore() + "\n");
buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
buf.append("Metadata: \n ");
@@ -424,35 +462,35 @@ public class CrawlDatum implements Writa
}
return buf.toString();
}
-
+
private boolean metadataEquals(org.apache.hadoop.io.MapWritable
otherMetaData) {
- if (metaData==null || metaData.size() ==0) {
+ if (metaData == null || metaData.size() == 0) {
return otherMetaData == null || otherMetaData.size() == 0;
}
if (otherMetaData == null) {
// we already know that the current object is not null or empty
return false;
}
- HashSet<Entry<Writable, Writable>> set1 =
- new HashSet<Entry<Writable,Writable>>(metaData.entrySet());
- HashSet<Entry<Writable, Writable>> set2 =
- new HashSet<Entry<Writable,Writable>>(otherMetaData.entrySet());
+ HashSet<Entry<Writable, Writable>> set1 = new HashSet<Entry<Writable,
Writable>>(
+ metaData.entrySet());
+ HashSet<Entry<Writable, Writable>> set2 = new HashSet<Entry<Writable,
Writable>>(
+ otherMetaData.entrySet());
return set1.equals(set2);
}
public boolean equals(Object o) {
if (!(o instanceof CrawlDatum))
return false;
- CrawlDatum other = (CrawlDatum)o;
- boolean res =
- (this.status == other.status) &&
- (this.fetchTime == other.fetchTime) &&
- (this.modifiedTime == other.modifiedTime) &&
- (this.retries == other.retries) &&
- (this.fetchInterval == other.fetchInterval) &&
- (SignatureComparator._compare(this.signature, other.signature) == 0) &&
- (this.score == other.score);
- if (!res) return res;
+ CrawlDatum other = (CrawlDatum) o;
+ boolean res = (this.status == other.status)
+ && (this.fetchTime == other.fetchTime)
+ && (this.modifiedTime == other.modifiedTime)
+ && (this.retries == other.retries)
+ && (this.fetchInterval == other.fetchInterval)
+ && (SignatureComparator._compare(this.signature, other.signature) == 0)
+ && (this.score == other.score);
+ if (!res)
+ return res;
return metadataEquals(other.metaData);
}
@@ -460,20 +498,14 @@ public class CrawlDatum implements Writa
int res = 0;
if (signature != null) {
for (int i = 0; i < signature.length / 4; i += 4) {
- res ^= (signature[i] << 24 + signature[i+1] << 16 +
- signature[i+2] << 8 + signature[i+3]);
+ res ^= (signature[i] << 24 + signature[i + 1] << 16 + signature[i + 2]
<< 8 + signature[i + 3]);
}
}
if (metaData != null) {
res ^= metaData.entrySet().hashCode();
}
- return
- res ^ status ^
- ((int)fetchTime) ^
- ((int)modifiedTime) ^
- retries ^
- fetchInterval ^
- Float.floatToIntBits(score);
+ return res ^ status ^ ((int) fetchTime) ^ ((int) modifiedTime) ^ retries
+ ^ fetchInterval ^ Float.floatToIntBits(score);
}
public Object clone() {
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Jan 29
05:38:59 2015
@@ -38,8 +38,8 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
/**
- * This class takes the output of the fetcher and updates the
- * crawldb accordingly.
+ * This class takes the output of the fetcher and updates the crawldb
+ * accordingly.
*/
public class CrawlDb extends Configured implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(CrawlDb.class);
@@ -49,21 +49,26 @@ public class CrawlDb extends Configured
public static final String CRAWLDB_PURGE_404 = "db.update.purge.404";
public static final String CURRENT_NAME = "current";
-
+
public static final String LOCK_NAME = ".locked";
-
- public CrawlDb() {}
-
+
+ public CrawlDb() {
+ }
+
public CrawlDb(Configuration conf) {
setConf(conf);
}
- public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+ public void update(Path crawlDb, Path[] segments, boolean normalize,
+ boolean filter) throws IOException {
+ boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED,
+ true);
update(crawlDb, segments, normalize, filter, additionsAllowed, false);
}
-
- public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException {
+
+ public void update(Path crawlDb, Path[] segments, boolean normalize,
+ boolean filter, boolean additionsAllowed, boolean force)
+ throws IOException {
FileSystem fs = FileSystem.get(getConf());
Path lock = new Path(crawlDb, LOCK_NAME);
LockUtil.createLockFile(fs, lock, force);
@@ -106,22 +111,24 @@ public class CrawlDb extends Configured
} catch (IOException e) {
LockUtil.removeLockFile(fs, lock);
Path outPath = FileOutputFormat.getOutputPath(job);
- if (fs.exists(outPath) ) fs.delete(outPath, true);
+ if (fs.exists(outPath))
+ fs.delete(outPath, true);
throw e;
}
CrawlDb.install(job, crawlDb);
long end = System.currentTimeMillis();
- LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
-/*
- * Configure a new CrawlDb in a temp folder at crawlDb/<rand>
- */
+
+ /*
+ * Configure a new CrawlDb in a temp folder at crawlDb/<rand>
+ */
public static JobConf createJob(Configuration config, Path crawlDb)
- throws IOException {
- Path newCrawlDb =
- new Path(crawlDb,
- Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ throws IOException {
+ Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random()
+ .nextInt(Integer.MAX_VALUE)));
JobConf job = new NutchJob(config);
job.setJobName("crawldb " + crawlDb);
@@ -154,12 +161,14 @@ public class CrawlDb extends Configured
Path old = new Path(crawlDb, "old");
Path current = new Path(crawlDb, CURRENT_NAME);
if (fs.exists(current)) {
- if (fs.exists(old)) fs.delete(old, true);
+ if (fs.exists(old))
+ fs.delete(old, true);
fs.rename(current, old);
}
fs.mkdirs(crawlDb);
fs.rename(newCrawlDb, current);
- if (!preserveBackup && fs.exists(old)) fs.delete(old, true);
+ if (!preserveBackup && fs.exists(old))
+ fs.delete(old, true);
Path lock = new Path(crawlDb, LOCK_NAME);
LockUtil.removeLockFile(fs, lock);
}
@@ -171,20 +180,29 @@ public class CrawlDb extends Configured
public int run(String[] args) throws Exception {
if (args.length < 1) {
- System.err.println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
+ System.err
+ .println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
System.err.println("\tcrawldb\tCrawlDb to update");
- System.err.println("\t-dir segments\tparent directory containing all segments to update from");
- System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
- System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
- System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
- System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
- System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
+ System.err
+ .println("\t-dir segments\tparent directory containing all segments to update from");
+ System.err
+ .println("\tseg1 seg2 ...\tlist of segment names to update from");
+ System.err
+ .println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
+ System.err
+ .println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
+ System.err
+ .println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
+ System.err
+ .println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
return -1;
}
- boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false);
+ boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING,
+ false);
boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
+ boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED,
+ true);
boolean force = false;
final FileSystem fs = FileSystem.get(getConf());
HashSet<Path> dirs = new HashSet<Path>();
@@ -198,14 +216,16 @@ public class CrawlDb extends Configured
} else if (args[i].equals("-noAdditions")) {
additionsAllowed = false;
} else if (args[i].equals("-dir")) {
- FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
+ FileStatus[] paths = fs.listStatus(new Path(args[++i]),
+ HadoopFSUtil.getPassDirectoriesFilter(fs));
dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
} else {
dirs.add(new Path(args[i]));
}
}
try {
- update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed, force);
+ update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize,
+ filter, additionsAllowed, force);
return 0;
} catch (Exception e) {
LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbFilter.java Thu Jan 29 05:38:59 2015
@@ -30,12 +30,13 @@ import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
/**
- * This class provides a way to separate the URL normalization
- * and filtering steps from the rest of CrawlDb manipulation code.
+ * This class provides a way to separate the URL normalization and filtering
+ * steps from the rest of CrawlDb manipulation code.
*
* @author Andrzej Bialecki
*/
-public class CrawlDbFilter implements Mapper<Text, CrawlDatum, Text, CrawlDatum> {
+public class CrawlDbFilter implements
+ Mapper<Text, CrawlDatum, Text, CrawlDatum> {
public static final String URL_FILTERING = "crawldb.url.filters";
public static final String URL_NORMALIZING = "crawldb.url.normalizers";
@@ -51,7 +52,7 @@ public class CrawlDbFilter implements Ma
private URLFilters filters;
private URLNormalizers normalizers;
-
+
private String scope;
public static final Logger LOG = LoggerFactory.getLogger(CrawlDbFilter.class);
@@ -70,17 +71,19 @@ public class CrawlDbFilter implements Ma
}
}
- public void close() {}
-
+ public void close() {
+ }
+
private Text newKey = new Text();
public void map(Text key, CrawlDatum value,
- OutputCollector<Text, CrawlDatum> output,
- Reporter reporter) throws IOException {
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+ throws IOException {
String url = key.toString();
- // https://issues.apache.org/jira/browse/NUTCH-1101 check status first, cheaper than normalizing or filtering
+ // https://issues.apache.org/jira/browse/NUTCH-1101 check status first,
+ // cheaper than normalizing or filtering
if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
url = null;
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Thu Jan 29 05:38:59 2015
@@ -39,36 +39,42 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
/**
- * This tool merges several CrawlDb-s into one, optionally filtering
- * URLs through the current URLFilters, to skip prohibited
- * pages.
+ * This tool merges several CrawlDb-s into one, optionally filtering URLs
+ * through the current URLFilters, to skip prohibited pages.
*
- * <p>It's possible to use this tool just for filtering - in that case
- * only one CrawlDb should be specified in arguments.</p>
- * <p>If more than one CrawlDb contains information about the same URL,
- * only the most recent version is retained, as determined by the
- * value of {@link org.apache.nutch.crawl.CrawlDatum#getFetchTime()}.
- * However, all metadata information from all versions is accumulated,
- * with newer values taking precedence over older values.
+ * <p>
+ * It's possible to use this tool just for filtering - in that case only one
+ * CrawlDb should be specified in arguments.
+ * </p>
+ * <p>
+ * If more than one CrawlDb contains information about the same URL, only the
+ * most recent version is retained, as determined by the value of
+ * {@link org.apache.nutch.crawl.CrawlDatum#getFetchTime()}. However, all
+ * metadata information from all versions is accumulated, with newer values
+ * taking precedence over older values.
*
* @author Andrzej Bialecki
*/
public class CrawlDbMerger extends Configured implements Tool {
- private static final Logger LOG = LoggerFactory.getLogger(CrawlDbMerger.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(CrawlDbMerger.class);
- public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
+ public static class Merger extends MapReduceBase implements
+ Reducer<Text, CrawlDatum, Text, CrawlDatum> {
private org.apache.hadoop.io.MapWritable meta;
private CrawlDatum res = new CrawlDatum();
private FetchSchedule schedule;
- public void close() throws IOException {}
+ public void close() throws IOException {
+ }
public void configure(JobConf conf) {
schedule = FetchScheduleFactory.getFetchSchedule(conf);
}
- public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter)
- throws IOException {
+ public void reduce(Text key, Iterator<CrawlDatum> values,
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+ throws IOException {
long resTime = 0L;
boolean resSet = false;
meta = new org.apache.hadoop.io.MapWritable();
@@ -91,7 +97,7 @@ public class CrawlDbMerger extends Confi
meta.put(e.getKey(), e.getValue());
}
res.set(val);
- resTime = valTime ;
+ resTime = valTime;
} else {
// insert older metadata before newer
for (Entry<Writable, Writable> e : meta.entrySet()) {
@@ -104,37 +110,44 @@ public class CrawlDbMerger extends Confi
output.collect(key, res);
}
}
-
+
public CrawlDbMerger() {
-
+
}
-
+
public CrawlDbMerger(Configuration conf) {
setConf(conf);
}
- public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
+ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter)
+ throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("CrawlDb merge: starting at " + sdf.format(start));
JobConf job = createMergeJob(getConf(), output, normalize, filter);
for (int i = 0; i < dbs.length; i++) {
- if (LOG.isInfoEnabled()) { LOG.info("Adding " + dbs[i]); }
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Adding " + dbs[i]);
+ }
FileInputFormat.addInputPath(job, new Path(dbs[i], CrawlDb.CURRENT_NAME));
}
JobClient.runJob(job);
FileSystem fs = FileSystem.get(getConf());
- if(fs.exists(output))
- fs.delete(output,true);
+ if (fs.exists(output))
+ fs.delete(output, true);
fs.mkdirs(output);
- fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, CrawlDb.CURRENT_NAME));
+ fs.rename(FileOutputFormat.getOutputPath(job), new Path(output,
+ CrawlDb.CURRENT_NAME));
long end = System.currentTimeMillis();
- LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
}
- public static JobConf createMergeJob(Configuration conf, Path output, boolean normalize, boolean filter) {
- Path newCrawlDb = new Path("crawldb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ public static JobConf createMergeJob(Configuration conf, Path output,
+ boolean normalize, boolean filter) {
+ Path newCrawlDb = new Path("crawldb-merge-"
+ + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
JobConf job = new NutchJob(conf);
job.setJobName("crawldb merge " + output);
@@ -158,16 +171,20 @@ public class CrawlDbMerger extends Confi
* @param args
*/
public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(),
+ args);
System.exit(res);
}
-
+
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err.println("Usage: CrawlDbMerger <output_crawldb> <crawldb1> [<crawldb2> <crawldb3> ...] [-normalize] [-filter]");
+ System.err
+ .println("Usage: CrawlDbMerger <output_crawldb> <crawldb1> [<crawldb2> <crawldb3> ...] [-normalize] [-filter]");
System.err.println("\toutput_crawldb\toutput CrawlDb");
- System.err.println("\tcrawldb1 ...\tinput CrawlDb-s (single input CrawlDb is ok)");
- System.err.println("\t-normalize\tuse URLNormalizer on urls in the crawldb(s) (usually not needed)");
+ System.err
+ .println("\tcrawldb1 ...\tinput CrawlDb-s (single input CrawlDb is ok)");
+ System.err
+ .println("\t-normalize\tuse URLNormalizer on urls in the crawldb(s) (usually not needed)");
System.err.println("\t-filter\tuse URLFilters on urls in the crawldb(s)");
return -1;
}
@@ -185,8 +202,8 @@ public class CrawlDbMerger extends Confi
continue;
}
final Path dbPath = new Path(args[i]);
- if(fs.exists(dbPath))
- dbs.add(dbPath);
+ if (fs.exists(dbPath))
+ dbs.add(dbPath);
}
try {
merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter);