This is an automated email from the ASF dual-hosted git repository. jnioche pushed a commit to branch 851 in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git
commit d2ef5a1fdfa2f3b4d96082e652579ade8db46eb0 Merge: 27414e98 bdc34cbc Author: Julien Nioche <[email protected]> AuthorDate: Thu Mar 28 15:32:25 2024 +0000 Merge branch 'main' into 851 .github/workflows/code_coverage.yml | 29 ++ .github/workflows/maven.yml | 6 +- DISCLAIMER | 10 + NOTICE | 4 +- README.md | 36 +- THIRD-PARTY.properties | 4 + THIRD-PARTY.txt | 547 +++++++++++++++++++++ archetype/pom.xml | 6 +- .../META-INF/maven/archetype-metadata.xml | 71 +-- .../main/resources/archetype-resources/README.md | 2 +- .../archetype-resources/crawler-conf.yaml | 58 ++- .../resources/archetype-resources/crawler.flux | 20 +- .../src/main/resources/archetype-resources/pom.xml | 24 +- .../src/main/java/CrawlTopology.java | 24 +- .../src/main/resources/jsoupfilters.json | 6 +- .../src/main/resources/parsefilters.json | 8 +- .../src/main/resources/urlfilters.json | 18 +- core/pom.xml | 54 +- .../stormcrawler/protocol/Protocol.java | 40 -- .../selenium/DelegatorRemoteDriverProtocol.java | 81 --- .../protocol/selenium/RemoteDriverProtocol.java | 87 ---- .../apache}/stormcrawler/ConfigurableTopology.java | 6 +- .../apache}/stormcrawler/Constants.java | 2 +- .../apache}/stormcrawler/JSONResource.java | 2 +- .../apache}/stormcrawler/Metadata.java | 20 +- .../apache}/stormcrawler/bolt/FeedParserBolt.java | 28 +- .../apache}/stormcrawler/bolt/FetcherBolt.java | 108 ++-- .../apache}/stormcrawler/bolt/JSoupParserBolt.java | 48 +- .../stormcrawler/bolt/SimpleFetcherBolt.java | 45 +- .../stormcrawler/bolt/SiteMapParserBolt.java | 28 +- .../stormcrawler/bolt/StatusEmitterBolt.java | 26 +- .../apache}/stormcrawler/bolt/URLFilterBolt.java | 12 +- .../stormcrawler/bolt/URLPartitionerBolt.java | 8 +- .../apache}/stormcrawler/filtering/URLFilter.java | 6 +- .../apache}/stormcrawler/filtering/URLFilters.java | 84 +++- .../filtering/basic/BasicURLFilter.java | 6 +- .../filtering/basic/BasicURLNormalizer.java | 23 +- .../filtering/basic/SelfURLFilter.java | 6 +- .../filtering/depth/MaxDepthFilter.java | 8 +- .../stormcrawler/filtering/host/HostURLFilter.java | 6 +- .../filtering/metadata/MetadataFilter.java | 6 +- .../filtering/regex/FastURLFilter.java | 10 +- .../stormcrawler/filtering/regex/RegexRule.java | 2 +- .../filtering/regex/RegexURLFilter.java | 2 +- .../filtering/regex/RegexURLFilterBase.java | 6 +- .../filtering/regex/RegexURLNormalizer.java | 6 +- .../filtering/robots/RobotsFilter.java | 14 +- .../filtering/sitemap/SitemapFilter.java | 10 +- .../stormcrawler/indexing/AbstractIndexerBolt.java | 143 ++++-- .../stormcrawler/indexing/DummyIndexer.java | 8 +- .../stormcrawler/indexing/StdOutIndexer.java | 8 +- .../stormcrawler/jsoup/LDJsonParseFilter.java | 12 +- .../stormcrawler/jsoup/LinkParseFilter.java | 20 +- .../apache}/stormcrawler/jsoup/XPathFilter.java | 12 +- .../parse/DocumentFragmentBuilder.java | 2 +- .../apache}/stormcrawler/parse/JSoupFilter.java | 7 +- .../apache}/stormcrawler/parse/JSoupFilters.java | 10 +- .../apache}/stormcrawler/parse/Outlink.java | 4 +- .../apache}/stormcrawler/parse/ParseData.java | 4 +- .../apache}/stormcrawler/parse/ParseFilter.java | 8 +- .../apache}/stormcrawler/parse/ParseFilters.java | 8 +- .../apache}/stormcrawler/parse/ParseResult.java | 4 +- .../apache}/stormcrawler/parse/TextExtractor.java | 4 +- .../parse/filter/CollectionTagger.java | 10 +- .../CommaSeparatedToMultivaluedMetadata.java | 8 +- .../parse/filter/DebugParseFilter.java | 6 +- .../parse/filter/DomainParseFilter.java | 12 +- .../parse/filter/LDJsonParseFilter.java | 10 +- .../stormcrawler/parse/filter/LinkParseFilter.java | 20 +- .../parse/filter/MD5SignatureParseFilter.java | 10 +- .../parse/filter/MimeTypeNormalization.java | 8 +- .../stormcrawler/parse/filter/XPathFilter.java | 10 +- .../persistence/AbstractQueryingSpout.java | 9 +- .../persistence/AbstractStatusUpdaterBolt.java | 10 +- .../persistence/AdaptiveScheduler.java | 22 +- .../stormcrawler/persistence/DefaultScheduler.java | 14 +- .../persistence/EmptyQueueListener.java | 2 +- .../persistence/MemoryStatusUpdater.java | 6 +- .../stormcrawler/persistence/Scheduler.java | 8 +- .../apache}/stormcrawler/persistence/Status.java | 2 +- .../persistence/StdOutStatusUpdater.java | 4 +- .../persistence/urlbuffer/AbstractURLBuffer.java | 8 +- .../persistence/urlbuffer/PriorityURLBuffer.java | 4 +- .../persistence/urlbuffer/SchedulingURLBuffer.java | 4 +- .../persistence/urlbuffer/SimpleURLBuffer.java | 2 +- .../persistence/urlbuffer/URLBuffer.java | 12 +- .../protocol/AbstractHttpProtocol.java | 118 +---- .../stormcrawler/protocol/DelegatorProtocol.java | 159 ++++-- .../apache}/stormcrawler/protocol/HttpHeaders.java | 2 +- .../protocol/HttpRobotRulesParser.java | 87 +++- .../org/apache/stormcrawler/protocol/Protocol.java | 156 ++++++ .../stormcrawler/protocol/ProtocolFactory.java | 17 +- .../stormcrawler/protocol/ProtocolResponse.java | 10 +- .../apache}/stormcrawler/protocol/RobotRules.java | 2 +- .../stormcrawler/protocol/RobotRulesParser.java | 84 +++- .../stormcrawler/protocol/file/FileProtocol.java | 16 +- .../stormcrawler/protocol/file/FileResponse.java | 8 +- .../protocol/httpclient/HttpProtocol.java | 25 +- .../protocol/okhttp/DNSResolutionListener.java | 2 +- .../stormcrawler/protocol/okhttp/HttpProtocol.java | 24 +- .../protocol/selenium/NavigationFilter.java | 8 +- .../protocol/selenium/NavigationFilters.java | 14 +- .../protocol/selenium/RemoteDriverProtocol.java | 131 +++++ .../protocol/selenium/SeleniumProtocol.java | 26 +- .../stormcrawler/proxy/MultiProxyManager.java | 8 +- .../apache}/stormcrawler/proxy/ProxyManager.java | 4 +- .../apache}/stormcrawler/proxy/SCProxy.java | 14 +- .../stormcrawler/proxy/SingleProxyManager.java | 13 +- .../apache}/stormcrawler/spout/FileSpout.java | 32 +- .../apache}/stormcrawler/spout/MemorySpout.java | 10 +- .../stormcrawler/util/AbstractConfigurable.java | 2 +- .../stormcrawler/util/CharsetIdentification.java | 6 +- .../stormcrawler/util/CollectionMetric.java | 2 +- .../apache}/stormcrawler/util/ConfUtils.java | 83 +++- .../apache}/stormcrawler/util/Configurable.java | 2 +- .../stormcrawler/util/ConfigurableHelper.java | 2 +- .../apache}/stormcrawler/util/CookieConverter.java | 2 +- .../stormcrawler/util/InitialisationUtil.java | 2 +- .../stormcrawler/util/MetadataTransfer.java | 30 +- .../stormcrawler/util/PerSecondReducer.java | 2 +- .../apache}/stormcrawler/util/RefreshTag.java | 2 +- .../apache}/stormcrawler/util/RobotsTags.java | 4 +- .../apache}/stormcrawler/util/StringTabScheme.java | 4 +- .../apache}/stormcrawler/util/URLPartitioner.java | 28 +- .../stormcrawler/util/URLStreamGrouping.java | 8 +- .../apache}/stormcrawler/util/URLUtil.java | 2 +- core/src/main/resources/crawler-default.yaml | 97 +++- .../apache/stormcrawler/MetadataTest.java} | 26 +- .../stormcrawler/TestMetadataSerialization.java | 2 +- .../apache}/stormcrawler/TestOutputCollector.java | 2 +- .../apache}/stormcrawler/TestUtil.java | 2 +- .../stormcrawler/bolt/AbstractFetcherBoltTest.java | 12 +- .../stormcrawler/bolt/FeedParserBoltTest.java | 16 +- .../apache}/stormcrawler/bolt/FetcherBoltTest.java | 2 +- .../stormcrawler/bolt/JSoupParserBoltTest.java | 16 +- .../stormcrawler/bolt/SimpleFetcherBoltTest.java | 2 +- .../stormcrawler/bolt/SiteMapParserBoltTest.java | 95 ++-- .../stormcrawler/filtering/BasicURLFilterTest.java | 6 +- .../filtering/BasicURLNormalizerTest.java | 22 +- .../stormcrawler/filtering/FastURLFilterTest.java | 6 +- .../stormcrawler/filtering/HostURLFilterTest.java | 6 +- .../stormcrawler/filtering/MaxDepthFilterTest.java | 8 +- .../stormcrawler/filtering/MetadataFilterTest.java | 6 +- .../stormcrawler/filtering/RegexFilterTest.java | 6 +- .../ClassInheritingFomAbstractAndInterface.java | 6 +- .../ClassInheritingFromAbstractClassOnly.java | 4 +- .../ClassInheritingFromOpenClass.java | 4 +- .../ClassWithoutValidConstructor.java | 4 +- .../initialisation/FinalClassToInitialize.java | 2 +- .../helper/initialisation/SimpleOpenClass.java | 2 +- .../helper/initialisation/base/AbstractClass.java | 2 +- .../helper/initialisation/base/ITestInterface.java | 2 +- .../OpenClassWithAbstractClassAndInterface.java | 2 +- .../stormcrawler/indexer/BasicIndexingTest.java | 27 +- .../apache}/stormcrawler/indexer/DummyIndexer.java | 6 +- .../stormcrawler/indexer/IndexerTester.java | 10 +- .../apache}/stormcrawler/json/JsoupFilterTest.java | 10 +- .../stormcrawler/jsoup/JSoupFiltersTest.java | 10 +- .../stormcrawler/parse/DuplicateLinksTest.java | 10 +- .../apache}/stormcrawler/parse/ParsingTester.java | 8 +- .../stormcrawler/parse/StackOverflowTest.java | 8 +- .../stormcrawler/parse/TextExtractorTest.java | 2 +- .../parse/filter/CSVMetadataFilterTest.java | 8 +- .../parse/filter/CollectionTaggerTest.java | 4 +- .../parse/filter/SubDocumentsFilterTest.java | 8 +- .../parse/filter/SubDocumentsParseFilter.java | 8 +- .../stormcrawler/parse/filter/XPathFilterTest.java | 8 +- .../persistence/AdaptiveSchedulerTest.java | 19 +- .../persistence/DefaultSchedulerTest.java | 4 +- .../stormcrawler/persistence/URLBufferTest.java | 10 +- .../protocol/AbstractProtocolTest.java | 96 ++++ .../protocol/DelegationProtocolTest.java | 41 +- .../stormcrawler/protocol/DummyProtocol.java} | 28 +- .../stormcrawler/protocol/HttpHeadersTest.java | 2 +- .../protocol/HttpRobotRulesParserTest.java | 282 +++++++++++ .../protocol/selenium/ProtocolTest.java | 166 +++++++ .../stormcrawler/proxy/MultiProxyManagerTest.java | 2 +- .../apache}/stormcrawler/proxy/SCProxyTest.java | 2 +- .../stormcrawler/proxy/SingleProxyManagerTest.java | 2 +- .../apache/stormcrawler/util/ConfUtilsTest.java | 64 +++ .../stormcrawler/util/CookieConverterTest.java | 2 +- .../stormcrawler/util/InitialisationUtilTest.java | 6 +- .../stormcrawler/util/MetadataTransferTest.java | 61 ++- .../apache}/stormcrawler/util/RefreshTagTest.java | 2 +- .../apache}/stormcrawler/util/RobotsTagsTest.java | 4 +- core/src/test/resources/basicurlnormalizer.json | 4 +- core/src/test/resources/delegator-conf.yaml | 21 +- core/src/test/resources/test.jsoupfilters.json | 8 +- core/src/test/resources/test.parsefilters.json | 8 +- core/src/test/resources/test.subdocfilter.json | 6 +- .../test/resources/tripadvisor.sitemap.index.xml | 22 + core/src/test/resources/tripadvisor.sitemap.xml.gz | Bin 0 -> 1537978 bytes external/aws/README.md | 2 +- external/aws/pom.xml | 8 +- .../aws/bolt/CloudSearchConstants.java | 2 +- .../aws/bolt/CloudSearchIndexerBolt.java | 12 +- .../stormcrawler/aws/bolt/CloudSearchUtils.java | 2 +- .../stormcrawler/aws/s3/AbstractS3CacheBolt.java | 4 +- .../stormcrawler/aws/s3/S3CacheChecker.java | 6 +- .../apache}/stormcrawler/aws/s3/S3Cacher.java | 6 +- .../stormcrawler/aws/s3/S3ContentCacher.java | 4 +- external/elasticsearch/README.md | 20 +- external/elasticsearch/archetype/pom.xml | 4 +- .../META-INF/maven/archetype-metadata.xml | 35 +- .../main/resources/archetype-resources/README.md | 6 +- .../archetype-resources/crawler-conf.yaml | 58 ++- .../resources/archetype-resources/es-conf.yaml | 2 +- .../resources/archetype-resources/es-crawler.flux | 52 +- .../archetype-resources/es-injection.flux | 50 ++ .../archetype-resources/kibana/importKibana.sh | 8 +- .../src/main/resources/archetype-resources/pom.xml | 24 +- .../src/main/java/ESCrawlTopology.java | 36 +- .../src/main/resources/jsoupfilters.json | 6 +- .../src/main/resources/parsefilters.json | 8 +- .../src/main/resources/urlfilters.json | 18 +- external/elasticsearch/pom.xml | 9 +- .../BulkItemResponseToFailedFlag.java | 2 +- .../elasticsearch/ElasticSearchConnection.java | 4 +- .../elasticsearch/bolt/DeletionBolt.java | 23 +- .../elasticsearch/bolt/IndexerBolt.java | 20 +- .../filtering/JSONURLFilterWrapper.java | 14 +- .../elasticsearch/metrics/MetricsConsumer.java | 8 +- .../elasticsearch/metrics/StatusMetricsBolt.java | 6 +- .../parse/filter/JSONResourceWrapper.java | 14 +- .../elasticsearch/persistence/AbstractSpout.java | 10 +- .../persistence/AggregationSpout.java | 8 +- .../elasticsearch/persistence/CollapsingSpout.java | 4 +- .../elasticsearch/persistence/HybridSpout.java | 6 +- .../elasticsearch/persistence/ScrollSpout.java | 10 +- .../persistence/StatusUpdaterBolt.java | 50 +- .../elasticsearch/bolt/IndexerBoltTest.java | 12 +- .../elasticsearch/bolt/StatusBoltTest.java | 14 +- external/langid/pom.xml | 6 +- .../stormcrawler/parse/filter/LanguageID.java | 12 +- external/opensearch/OS_IndexInit.sh | 23 - external/opensearch/README.md | 19 +- external/opensearch/archetype/pom.xml | 4 +- .../META-INF/archetype-post-generate.groovy | 5 +- .../META-INF/maven/archetype-metadata.xml | 37 +- .../resources/archetype-resources/OS_IndexInit.sh | 25 + .../main/resources/archetype-resources/README.md | 17 +- .../archetype-resources/crawler-conf.yaml | 58 ++- .../resources/archetype-resources/crawler.flux | 50 +- .../dashboards/importDashboards.sh | 8 +- .../resources/archetype-resources/injection.flux | 50 ++ .../archetype-resources/opensearch-conf.yaml | 12 +- .../src/main/resources/archetype-resources/pom.xml | 24 +- .../src/main/resources/indexer.mapping} | 0 .../src/main/resources/jsoupfilters.json | 6 +- .../src/main/resources/metrics.mapping | 0 .../src/main/resources/parsefilters.json | 8 +- .../src/main/resources/status.mapping | 0 .../src/main/resources/urlfilters.json | 18 +- external/opensearch/opensearch-conf.yaml | 12 +- external/opensearch/pom.xml | 24 +- .../stormcrawler/opensearch/bolt/DeletionBolt.java | 94 ---- .../opensearch/BulkItemResponseToFailedFlag.java | 10 +- .../apache}/stormcrawler/opensearch/Constants.java | 2 +- .../stormcrawler/opensearch/IndexCreation.java | 15 +- .../opensearch/OpenSearchConnection.java} | 102 ++-- .../stormcrawler/opensearch/bolt/DeletionBolt.java | 308 ++++++++++++ .../stormcrawler/opensearch/bolt/IndexerBolt.java | 59 +-- .../opensearch/filtering/JSONURLFilterWrapper.java | 16 +- .../opensearch/metrics/MetricsConsumer.java | 26 +- .../opensearch/metrics/StatusMetricsBolt.java | 20 +- .../parse/filter/JSONResourceWrapper.java | 38 +- .../opensearch/persistence/AbstractSpout.java | 78 +-- .../opensearch/persistence/AggregationSpout.java | 18 +- .../opensearch/persistence/HybridSpout.java | 22 +- .../opensearch/persistence/StatusUpdaterBolt.java | 81 +-- .../opensearch/bolt/AbstractOpenSearchTest.java | 46 ++ .../opensearch/bolt/IndexerBoltTest.java | 30 +- .../opensearch/bolt/StatusBoltTest.java | 38 +- .../resources/indexer.mapping} | 0 .../src/{main => test}/resources/metrics.mapping | 0 .../src/{main => test}/resources/status.mapping | 0 external/pom.xml | 23 +- external/solr/README.md | 2 +- external/solr/cores/status/conf/schema.xml | 2 +- external/solr/pom.xml | 14 +- external/solr/solr-conf.yaml | 2 +- .../apache}/stormcrawler/solr/SeedInjector.java | 10 +- .../apache}/stormcrawler/solr/SolrConnection.java | 4 +- .../stormcrawler/solr/SolrCrawlTopology.java | 26 +- .../stormcrawler/solr/bolt/DeletionBolt.java | 86 ++++ .../stormcrawler/solr/bolt/IndexerBolt.java | 13 +- .../stormcrawler/solr/metrics/MetricsConsumer.java | 6 +- .../stormcrawler/solr/persistence/SolrSpout.java | 11 +- .../solr/persistence/StatusUpdaterBolt.java | 17 +- external/sql/pom.xml | 6 +- external/sql/sql-conf.yaml | 2 +- .../apache}/stormcrawler/sql/Constants.java | 2 +- .../apache}/stormcrawler/sql/IndexerBolt.java | 12 +- .../apache}/stormcrawler/sql/SQLSpout.java | 10 +- .../apache}/stormcrawler/sql/SQLUtil.java | 2 +- .../stormcrawler/sql/StatusUpdaterBolt.java | 33 +- .../stormcrawler/sql/metrics/MetricsConsumer.java | 18 +- external/tika/README.md | 4 +- external/tika/pom.xml | 12 +- .../apache}/stormcrawler/tika/DOMBuilder.java | 2 +- .../apache}/stormcrawler/tika/ParserBolt.java | 38 +- .../apache}/stormcrawler/tika/RedirectionBolt.java | 4 +- .../stormcrawler/tika/XMLCharacterRecognizer.java | 2 +- .../apache}/stormcrawler/tika/ParserBoltTest.java | 16 +- external/urlfrontier/README.md | 2 +- external/urlfrontier/pom.xml | 9 +- .../stormcrawler/urlfrontier/Constants.java | 2 +- .../urlfrontier/ManagedChannelUtil.java | 4 +- .../apache}/stormcrawler/urlfrontier/Spout.java | 10 +- .../urlfrontier/StatusUpdaterBolt.java | 14 +- .../urlfrontier/StatusUpdaterBoltTest.java | 16 +- .../urlfrontier/URLFrontierContainer.java | 2 +- .../urlfrontier/URLFrontierContainerConfig.java | 2 +- external/warc/README.md | 43 +- external/warc/pom.xml | 20 +- .../warc/FileTimeSizeRotationPolicy.java | 2 +- .../apache}/stormcrawler/warc/GzipHdfsBolt.java | 2 +- .../stormcrawler/warc/WARCFileNameFormat.java | 2 +- .../apache}/stormcrawler/warc/WARCHdfsBolt.java | 6 +- .../stormcrawler/warc/WARCRecordFormat.java | 20 +- .../stormcrawler/warc/WARCRequestRecordFormat.java | 8 +- .../apache}/stormcrawler/warc/WARCSpout.java | 65 ++- .../stormcrawler/warc/WARCHdfsBoltTest.java | 10 +- .../stormcrawler/warc/WARCRecordFormatTest.java | 8 +- .../apache/stormcrawler/warc/WARCSpoutTest.java | 70 +++ external/warc/src/test/resources/test.warc.gz | Bin 0 -> 301243 bytes .../src/test/resources/unparsable-date.warc.gz | Bin 0 -> 938 bytes external/warc/src/test/resources/warc.inputs | 2 + pom.xml | 264 ++++++++-- 329 files changed, 5213 insertions(+), 2375 deletions(-)
