This is an automated email from the ASF dual-hosted git repository.

jnioche pushed a commit to branch 851
in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git

commit d2ef5a1fdfa2f3b4d96082e652579ade8db46eb0
Merge: 27414e98 bdc34cbc
Author: Julien Nioche <[email protected]>
AuthorDate: Thu Mar 28 15:32:25 2024 +0000

    Merge branch 'main' into 851

 .github/workflows/code_coverage.yml                |  29 ++
 .github/workflows/maven.yml                        |   6 +-
 DISCLAIMER                                         |  10 +
 NOTICE                                             |   4 +-
 README.md                                          |  36 +-
 THIRD-PARTY.properties                             |   4 +
 THIRD-PARTY.txt                                    | 547 +++++++++++++++++++++
 archetype/pom.xml                                  |   6 +-
 .../META-INF/maven/archetype-metadata.xml          |  71 +--
 .../main/resources/archetype-resources/README.md   |   2 +-
 .../archetype-resources/crawler-conf.yaml          |  58 ++-
 .../resources/archetype-resources/crawler.flux     |  20 +-
 .../src/main/resources/archetype-resources/pom.xml |  24 +-
 .../src/main/java/CrawlTopology.java               |  24 +-
 .../src/main/resources/jsoupfilters.json           |   6 +-
 .../src/main/resources/parsefilters.json           |   8 +-
 .../src/main/resources/urlfilters.json             |  18 +-
 core/pom.xml                                       |  54 +-
 .../stormcrawler/protocol/Protocol.java            |  40 --
 .../selenium/DelegatorRemoteDriverProtocol.java    |  81 ---
 .../protocol/selenium/RemoteDriverProtocol.java    |  87 ----
 .../apache}/stormcrawler/ConfigurableTopology.java |   6 +-
 .../apache}/stormcrawler/Constants.java            |   2 +-
 .../apache}/stormcrawler/JSONResource.java         |   2 +-
 .../apache}/stormcrawler/Metadata.java             |  20 +-
 .../apache}/stormcrawler/bolt/FeedParserBolt.java  |  28 +-
 .../apache}/stormcrawler/bolt/FetcherBolt.java     | 108 ++--
 .../apache}/stormcrawler/bolt/JSoupParserBolt.java |  48 +-
 .../stormcrawler/bolt/SimpleFetcherBolt.java       |  45 +-
 .../stormcrawler/bolt/SiteMapParserBolt.java       |  28 +-
 .../stormcrawler/bolt/StatusEmitterBolt.java       |  26 +-
 .../apache}/stormcrawler/bolt/URLFilterBolt.java   |  12 +-
 .../stormcrawler/bolt/URLPartitionerBolt.java      |   8 +-
 .../apache}/stormcrawler/filtering/URLFilter.java  |   6 +-
 .../apache}/stormcrawler/filtering/URLFilters.java |  84 +++-
 .../filtering/basic/BasicURLFilter.java            |   6 +-
 .../filtering/basic/BasicURLNormalizer.java        |  23 +-
 .../filtering/basic/SelfURLFilter.java             |   6 +-
 .../filtering/depth/MaxDepthFilter.java            |   8 +-
 .../stormcrawler/filtering/host/HostURLFilter.java |   6 +-
 .../filtering/metadata/MetadataFilter.java         |   6 +-
 .../filtering/regex/FastURLFilter.java             |  10 +-
 .../stormcrawler/filtering/regex/RegexRule.java    |   2 +-
 .../filtering/regex/RegexURLFilter.java            |   2 +-
 .../filtering/regex/RegexURLFilterBase.java        |   6 +-
 .../filtering/regex/RegexURLNormalizer.java        |   6 +-
 .../filtering/robots/RobotsFilter.java             |  14 +-
 .../filtering/sitemap/SitemapFilter.java           |  10 +-
 .../stormcrawler/indexing/AbstractIndexerBolt.java | 143 ++++--
 .../stormcrawler/indexing/DummyIndexer.java        |   8 +-
 .../stormcrawler/indexing/StdOutIndexer.java       |   8 +-
 .../stormcrawler/jsoup/LDJsonParseFilter.java      |  12 +-
 .../stormcrawler/jsoup/LinkParseFilter.java        |  20 +-
 .../apache}/stormcrawler/jsoup/XPathFilter.java    |  12 +-
 .../parse/DocumentFragmentBuilder.java             |   2 +-
 .../apache}/stormcrawler/parse/JSoupFilter.java    |   7 +-
 .../apache}/stormcrawler/parse/JSoupFilters.java   |  10 +-
 .../apache}/stormcrawler/parse/Outlink.java        |   4 +-
 .../apache}/stormcrawler/parse/ParseData.java      |   4 +-
 .../apache}/stormcrawler/parse/ParseFilter.java    |   8 +-
 .../apache}/stormcrawler/parse/ParseFilters.java   |   8 +-
 .../apache}/stormcrawler/parse/ParseResult.java    |   4 +-
 .../apache}/stormcrawler/parse/TextExtractor.java  |   4 +-
 .../parse/filter/CollectionTagger.java             |  10 +-
 .../CommaSeparatedToMultivaluedMetadata.java       |   8 +-
 .../parse/filter/DebugParseFilter.java             |   6 +-
 .../parse/filter/DomainParseFilter.java            |  12 +-
 .../parse/filter/LDJsonParseFilter.java            |  10 +-
 .../stormcrawler/parse/filter/LinkParseFilter.java |  20 +-
 .../parse/filter/MD5SignatureParseFilter.java      |  10 +-
 .../parse/filter/MimeTypeNormalization.java        |   8 +-
 .../stormcrawler/parse/filter/XPathFilter.java     |  10 +-
 .../persistence/AbstractQueryingSpout.java         |   9 +-
 .../persistence/AbstractStatusUpdaterBolt.java     |  10 +-
 .../persistence/AdaptiveScheduler.java             |  22 +-
 .../stormcrawler/persistence/DefaultScheduler.java |  14 +-
 .../persistence/EmptyQueueListener.java            |   2 +-
 .../persistence/MemoryStatusUpdater.java           |   6 +-
 .../stormcrawler/persistence/Scheduler.java        |   8 +-
 .../apache}/stormcrawler/persistence/Status.java   |   2 +-
 .../persistence/StdOutStatusUpdater.java           |   4 +-
 .../persistence/urlbuffer/AbstractURLBuffer.java   |   8 +-
 .../persistence/urlbuffer/PriorityURLBuffer.java   |   4 +-
 .../persistence/urlbuffer/SchedulingURLBuffer.java |   4 +-
 .../persistence/urlbuffer/SimpleURLBuffer.java     |   2 +-
 .../persistence/urlbuffer/URLBuffer.java           |  12 +-
 .../protocol/AbstractHttpProtocol.java             | 118 +----
 .../stormcrawler/protocol/DelegatorProtocol.java   | 159 ++++--
 .../apache}/stormcrawler/protocol/HttpHeaders.java |   2 +-
 .../protocol/HttpRobotRulesParser.java             |  87 +++-
 .../org/apache/stormcrawler/protocol/Protocol.java | 156 ++++++
 .../stormcrawler/protocol/ProtocolFactory.java     |  17 +-
 .../stormcrawler/protocol/ProtocolResponse.java    |  10 +-
 .../apache}/stormcrawler/protocol/RobotRules.java  |   2 +-
 .../stormcrawler/protocol/RobotRulesParser.java    |  84 +++-
 .../stormcrawler/protocol/file/FileProtocol.java   |  16 +-
 .../stormcrawler/protocol/file/FileResponse.java   |   8 +-
 .../protocol/httpclient/HttpProtocol.java          |  25 +-
 .../protocol/okhttp/DNSResolutionListener.java     |   2 +-
 .../stormcrawler/protocol/okhttp/HttpProtocol.java |  24 +-
 .../protocol/selenium/NavigationFilter.java        |   8 +-
 .../protocol/selenium/NavigationFilters.java       |  14 +-
 .../protocol/selenium/RemoteDriverProtocol.java    | 131 +++++
 .../protocol/selenium/SeleniumProtocol.java        |  26 +-
 .../stormcrawler/proxy/MultiProxyManager.java      |   8 +-
 .../apache}/stormcrawler/proxy/ProxyManager.java   |   4 +-
 .../apache}/stormcrawler/proxy/SCProxy.java        |  14 +-
 .../stormcrawler/proxy/SingleProxyManager.java     |  13 +-
 .../apache}/stormcrawler/spout/FileSpout.java      |  32 +-
 .../apache}/stormcrawler/spout/MemorySpout.java    |  10 +-
 .../stormcrawler/util/AbstractConfigurable.java    |   2 +-
 .../stormcrawler/util/CharsetIdentification.java   |   6 +-
 .../stormcrawler/util/CollectionMetric.java        |   2 +-
 .../apache}/stormcrawler/util/ConfUtils.java       |  83 +++-
 .../apache}/stormcrawler/util/Configurable.java    |   2 +-
 .../stormcrawler/util/ConfigurableHelper.java      |   2 +-
 .../apache}/stormcrawler/util/CookieConverter.java |   2 +-
 .../stormcrawler/util/InitialisationUtil.java      |   2 +-
 .../stormcrawler/util/MetadataTransfer.java        |  30 +-
 .../stormcrawler/util/PerSecondReducer.java        |   2 +-
 .../apache}/stormcrawler/util/RefreshTag.java      |   2 +-
 .../apache}/stormcrawler/util/RobotsTags.java      |   4 +-
 .../apache}/stormcrawler/util/StringTabScheme.java |   4 +-
 .../apache}/stormcrawler/util/URLPartitioner.java  |  28 +-
 .../stormcrawler/util/URLStreamGrouping.java       |   8 +-
 .../apache}/stormcrawler/util/URLUtil.java         |   2 +-
 core/src/main/resources/crawler-default.yaml       |  97 +++-
 .../apache/stormcrawler/MetadataTest.java}         |  26 +-
 .../stormcrawler/TestMetadataSerialization.java    |   2 +-
 .../apache}/stormcrawler/TestOutputCollector.java  |   2 +-
 .../apache}/stormcrawler/TestUtil.java             |   2 +-
 .../stormcrawler/bolt/AbstractFetcherBoltTest.java |  12 +-
 .../stormcrawler/bolt/FeedParserBoltTest.java      |  16 +-
 .../apache}/stormcrawler/bolt/FetcherBoltTest.java |   2 +-
 .../stormcrawler/bolt/JSoupParserBoltTest.java     |  16 +-
 .../stormcrawler/bolt/SimpleFetcherBoltTest.java   |   2 +-
 .../stormcrawler/bolt/SiteMapParserBoltTest.java   |  95 ++--
 .../stormcrawler/filtering/BasicURLFilterTest.java |   6 +-
 .../filtering/BasicURLNormalizerTest.java          |  22 +-
 .../stormcrawler/filtering/FastURLFilterTest.java  |   6 +-
 .../stormcrawler/filtering/HostURLFilterTest.java  |   6 +-
 .../stormcrawler/filtering/MaxDepthFilterTest.java |   8 +-
 .../stormcrawler/filtering/MetadataFilterTest.java |   6 +-
 .../stormcrawler/filtering/RegexFilterTest.java    |   6 +-
 .../ClassInheritingFomAbstractAndInterface.java    |   6 +-
 .../ClassInheritingFromAbstractClassOnly.java      |   4 +-
 .../ClassInheritingFromOpenClass.java              |   4 +-
 .../ClassWithoutValidConstructor.java              |   4 +-
 .../initialisation/FinalClassToInitialize.java     |   2 +-
 .../helper/initialisation/SimpleOpenClass.java     |   2 +-
 .../helper/initialisation/base/AbstractClass.java  |   2 +-
 .../helper/initialisation/base/ITestInterface.java |   2 +-
 .../OpenClassWithAbstractClassAndInterface.java    |   2 +-
 .../stormcrawler/indexer/BasicIndexingTest.java    |  27 +-
 .../apache}/stormcrawler/indexer/DummyIndexer.java |   6 +-
 .../stormcrawler/indexer/IndexerTester.java        |  10 +-
 .../apache}/stormcrawler/json/JsoupFilterTest.java |  10 +-
 .../stormcrawler/jsoup/JSoupFiltersTest.java       |  10 +-
 .../stormcrawler/parse/DuplicateLinksTest.java     |  10 +-
 .../apache}/stormcrawler/parse/ParsingTester.java  |   8 +-
 .../stormcrawler/parse/StackOverflowTest.java      |   8 +-
 .../stormcrawler/parse/TextExtractorTest.java      |   2 +-
 .../parse/filter/CSVMetadataFilterTest.java        |   8 +-
 .../parse/filter/CollectionTaggerTest.java         |   4 +-
 .../parse/filter/SubDocumentsFilterTest.java       |   8 +-
 .../parse/filter/SubDocumentsParseFilter.java      |   8 +-
 .../stormcrawler/parse/filter/XPathFilterTest.java |   8 +-
 .../persistence/AdaptiveSchedulerTest.java         |  19 +-
 .../persistence/DefaultSchedulerTest.java          |   4 +-
 .../stormcrawler/persistence/URLBufferTest.java    |  10 +-
 .../protocol/AbstractProtocolTest.java             |  96 ++++
 .../protocol/DelegationProtocolTest.java           |  41 +-
 .../stormcrawler/protocol/DummyProtocol.java}      |  28 +-
 .../stormcrawler/protocol/HttpHeadersTest.java     |   2 +-
 .../protocol/HttpRobotRulesParserTest.java         | 282 +++++++++++
 .../protocol/selenium/ProtocolTest.java            | 166 +++++++
 .../stormcrawler/proxy/MultiProxyManagerTest.java  |   2 +-
 .../apache}/stormcrawler/proxy/SCProxyTest.java    |   2 +-
 .../stormcrawler/proxy/SingleProxyManagerTest.java |   2 +-
 .../apache/stormcrawler/util/ConfUtilsTest.java    |  64 +++
 .../stormcrawler/util/CookieConverterTest.java     |   2 +-
 .../stormcrawler/util/InitialisationUtilTest.java  |   6 +-
 .../stormcrawler/util/MetadataTransferTest.java    |  61 ++-
 .../apache}/stormcrawler/util/RefreshTagTest.java  |   2 +-
 .../apache}/stormcrawler/util/RobotsTagsTest.java  |   4 +-
 core/src/test/resources/basicurlnormalizer.json    |   4 +-
 core/src/test/resources/delegator-conf.yaml        |  21 +-
 core/src/test/resources/test.jsoupfilters.json     |   8 +-
 core/src/test/resources/test.parsefilters.json     |   8 +-
 core/src/test/resources/test.subdocfilter.json     |   6 +-
 .../test/resources/tripadvisor.sitemap.index.xml   |  22 +
 core/src/test/resources/tripadvisor.sitemap.xml.gz | Bin 0 -> 1537978 bytes
 external/aws/README.md                             |   2 +-
 external/aws/pom.xml                               |   8 +-
 .../aws/bolt/CloudSearchConstants.java             |   2 +-
 .../aws/bolt/CloudSearchIndexerBolt.java           |  12 +-
 .../stormcrawler/aws/bolt/CloudSearchUtils.java    |   2 +-
 .../stormcrawler/aws/s3/AbstractS3CacheBolt.java   |   4 +-
 .../stormcrawler/aws/s3/S3CacheChecker.java        |   6 +-
 .../apache}/stormcrawler/aws/s3/S3Cacher.java      |   6 +-
 .../stormcrawler/aws/s3/S3ContentCacher.java       |   4 +-
 external/elasticsearch/README.md                   |  20 +-
 external/elasticsearch/archetype/pom.xml           |   4 +-
 .../META-INF/maven/archetype-metadata.xml          |  35 +-
 .../main/resources/archetype-resources/README.md   |   6 +-
 .../archetype-resources/crawler-conf.yaml          |  58 ++-
 .../resources/archetype-resources/es-conf.yaml     |   2 +-
 .../resources/archetype-resources/es-crawler.flux  |  52 +-
 .../archetype-resources/es-injection.flux          |  50 ++
 .../archetype-resources/kibana/importKibana.sh     |   8 +-
 .../src/main/resources/archetype-resources/pom.xml |  24 +-
 .../src/main/java/ESCrawlTopology.java             |  36 +-
 .../src/main/resources/jsoupfilters.json           |   6 +-
 .../src/main/resources/parsefilters.json           |   8 +-
 .../src/main/resources/urlfilters.json             |  18 +-
 external/elasticsearch/pom.xml                     |   9 +-
 .../BulkItemResponseToFailedFlag.java              |   2 +-
 .../elasticsearch/ElasticSearchConnection.java     |   4 +-
 .../elasticsearch/bolt/DeletionBolt.java           |  23 +-
 .../elasticsearch/bolt/IndexerBolt.java            |  20 +-
 .../filtering/JSONURLFilterWrapper.java            |  14 +-
 .../elasticsearch/metrics/MetricsConsumer.java     |   8 +-
 .../elasticsearch/metrics/StatusMetricsBolt.java   |   6 +-
 .../parse/filter/JSONResourceWrapper.java          |  14 +-
 .../elasticsearch/persistence/AbstractSpout.java   |  10 +-
 .../persistence/AggregationSpout.java              |   8 +-
 .../elasticsearch/persistence/CollapsingSpout.java |   4 +-
 .../elasticsearch/persistence/HybridSpout.java     |   6 +-
 .../elasticsearch/persistence/ScrollSpout.java     |  10 +-
 .../persistence/StatusUpdaterBolt.java             |  50 +-
 .../elasticsearch/bolt/IndexerBoltTest.java        |  12 +-
 .../elasticsearch/bolt/StatusBoltTest.java         |  14 +-
 external/langid/pom.xml                            |   6 +-
 .../stormcrawler/parse/filter/LanguageID.java      |  12 +-
 external/opensearch/OS_IndexInit.sh                |  23 -
 external/opensearch/README.md                      |  19 +-
 external/opensearch/archetype/pom.xml              |   4 +-
 .../META-INF/archetype-post-generate.groovy        |   5 +-
 .../META-INF/maven/archetype-metadata.xml          |  37 +-
 .../resources/archetype-resources/OS_IndexInit.sh  |  25 +
 .../main/resources/archetype-resources/README.md   |  17 +-
 .../archetype-resources/crawler-conf.yaml          |  58 ++-
 .../resources/archetype-resources/crawler.flux     |  50 +-
 .../dashboards/importDashboards.sh                 |   8 +-
 .../resources/archetype-resources/injection.flux   |  50 ++
 .../archetype-resources/opensearch-conf.yaml       |  12 +-
 .../src/main/resources/archetype-resources/pom.xml |  24 +-
 .../src/main/resources/indexer.mapping}            |   0
 .../src/main/resources/jsoupfilters.json           |   6 +-
 .../src/main/resources/metrics.mapping             |   0
 .../src/main/resources/parsefilters.json           |   8 +-
 .../src/main/resources/status.mapping              |   0
 .../src/main/resources/urlfilters.json             |  18 +-
 external/opensearch/opensearch-conf.yaml           |  12 +-
 external/opensearch/pom.xml                        |  24 +-
 .../stormcrawler/opensearch/bolt/DeletionBolt.java |  94 ----
 .../opensearch/BulkItemResponseToFailedFlag.java   |  10 +-
 .../apache}/stormcrawler/opensearch/Constants.java |   2 +-
 .../stormcrawler/opensearch/IndexCreation.java     |  15 +-
 .../opensearch/OpenSearchConnection.java}          | 102 ++--
 .../stormcrawler/opensearch/bolt/DeletionBolt.java | 308 ++++++++++++
 .../stormcrawler/opensearch/bolt/IndexerBolt.java  |  59 +--
 .../opensearch/filtering/JSONURLFilterWrapper.java |  16 +-
 .../opensearch/metrics/MetricsConsumer.java        |  26 +-
 .../opensearch/metrics/StatusMetricsBolt.java      |  20 +-
 .../parse/filter/JSONResourceWrapper.java          |  38 +-
 .../opensearch/persistence/AbstractSpout.java      |  78 +--
 .../opensearch/persistence/AggregationSpout.java   |  18 +-
 .../opensearch/persistence/HybridSpout.java        |  22 +-
 .../opensearch/persistence/StatusUpdaterBolt.java  |  81 +--
 .../opensearch/bolt/AbstractOpenSearchTest.java    |  46 ++
 .../opensearch/bolt/IndexerBoltTest.java           |  30 +-
 .../opensearch/bolt/StatusBoltTest.java            |  38 +-
 .../resources/indexer.mapping}                     |   0
 .../src/{main => test}/resources/metrics.mapping   |   0
 .../src/{main => test}/resources/status.mapping    |   0
 external/pom.xml                                   |  23 +-
 external/solr/README.md                            |   2 +-
 external/solr/cores/status/conf/schema.xml         |   2 +-
 external/solr/pom.xml                              |  14 +-
 external/solr/solr-conf.yaml                       |   2 +-
 .../apache}/stormcrawler/solr/SeedInjector.java    |  10 +-
 .../apache}/stormcrawler/solr/SolrConnection.java  |   4 +-
 .../stormcrawler/solr/SolrCrawlTopology.java       |  26 +-
 .../stormcrawler/solr/bolt/DeletionBolt.java       |  86 ++++
 .../stormcrawler/solr/bolt/IndexerBolt.java        |  13 +-
 .../stormcrawler/solr/metrics/MetricsConsumer.java |   6 +-
 .../stormcrawler/solr/persistence/SolrSpout.java   |  11 +-
 .../solr/persistence/StatusUpdaterBolt.java        |  17 +-
 external/sql/pom.xml                               |   6 +-
 external/sql/sql-conf.yaml                         |   2 +-
 .../apache}/stormcrawler/sql/Constants.java        |   2 +-
 .../apache}/stormcrawler/sql/IndexerBolt.java      |  12 +-
 .../apache}/stormcrawler/sql/SQLSpout.java         |  10 +-
 .../apache}/stormcrawler/sql/SQLUtil.java          |   2 +-
 .../stormcrawler/sql/StatusUpdaterBolt.java        |  33 +-
 .../stormcrawler/sql/metrics/MetricsConsumer.java  |  18 +-
 external/tika/README.md                            |   4 +-
 external/tika/pom.xml                              |  12 +-
 .../apache}/stormcrawler/tika/DOMBuilder.java      |   2 +-
 .../apache}/stormcrawler/tika/ParserBolt.java      |  38 +-
 .../apache}/stormcrawler/tika/RedirectionBolt.java |   4 +-
 .../stormcrawler/tika/XMLCharacterRecognizer.java  |   2 +-
 .../apache}/stormcrawler/tika/ParserBoltTest.java  |  16 +-
 external/urlfrontier/README.md                     |   2 +-
 external/urlfrontier/pom.xml                       |   9 +-
 .../stormcrawler/urlfrontier/Constants.java        |   2 +-
 .../urlfrontier/ManagedChannelUtil.java            |   4 +-
 .../apache}/stormcrawler/urlfrontier/Spout.java    |  10 +-
 .../urlfrontier/StatusUpdaterBolt.java             |  14 +-
 .../urlfrontier/StatusUpdaterBoltTest.java         |  16 +-
 .../urlfrontier/URLFrontierContainer.java          |   2 +-
 .../urlfrontier/URLFrontierContainerConfig.java    |   2 +-
 external/warc/README.md                            |  43 +-
 external/warc/pom.xml                              |  20 +-
 .../warc/FileTimeSizeRotationPolicy.java           |   2 +-
 .../apache}/stormcrawler/warc/GzipHdfsBolt.java    |   2 +-
 .../stormcrawler/warc/WARCFileNameFormat.java      |   2 +-
 .../apache}/stormcrawler/warc/WARCHdfsBolt.java    |   6 +-
 .../stormcrawler/warc/WARCRecordFormat.java        |  20 +-
 .../stormcrawler/warc/WARCRequestRecordFormat.java |   8 +-
 .../apache}/stormcrawler/warc/WARCSpout.java       |  65 ++-
 .../stormcrawler/warc/WARCHdfsBoltTest.java        |  10 +-
 .../stormcrawler/warc/WARCRecordFormatTest.java    |   8 +-
 .../apache/stormcrawler/warc/WARCSpoutTest.java    |  70 +++
 external/warc/src/test/resources/test.warc.gz      | Bin 0 -> 301243 bytes
 .../src/test/resources/unparsable-date.warc.gz     | Bin 0 -> 938 bytes
 external/warc/src/test/resources/warc.inputs       |   2 +
 pom.xml                                            | 264 ++++++++--
 329 files changed, 5213 insertions(+), 2375 deletions(-)

Reply via email to