This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 792ed2891 NUTCH-3015 Add more CI steps to GitHub master-build.yml (#790)
792ed2891 is described below
commit 792ed28914f4beb2fb8b8ce28eebe17196c92af1
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Fri Oct 27 15:04:22 2023 -0700
NUTCH-3015 Add more CI steps to GitHub master-build.yml (#790)
---
.../{master-build.yml => dependency-check.yml} | 25 ++++-----
.github/workflows/master-build.yml | 64 +++++++++++++++++-----
.gitignore | 1 +
build.xml | 52 +++++++++++++++---
.../dependency-check-suppressions.xml | 5 --
src/java/overview.html | 16 ++++++
.../creativecommons/conf/crawl-urlfilter.txt | 15 +++++
src/plugin/creativecommons/conf/nutch-site.xml | 16 ++++++
src/plugin/creativecommons/data/anchor.html | 16 ++++++
src/plugin/creativecommons/data/rdf.html | 16 ++++++
src/plugin/creativecommons/data/rel.html | 16 ++++++
src/plugin/creativecommons/ivy.xml | 1 -
src/plugin/exchange-jexl/README.md | 17 ++++++
src/plugin/exchange-jexl/ivy.xml | 1 -
src/plugin/feed/ivy.xml | 1 -
src/plugin/headings/ivy.xml | 1 -
src/plugin/index-anchor/ivy.xml | 1 -
src/plugin/index-basic/ivy.xml | 1 -
src/plugin/index-geoip/ivy.xml | 1 -
src/plugin/index-geoip/plugin.xml | 1 +
src/plugin/index-jexl-filter/ivy.xml | 1 -
src/plugin/index-links/README.md | 17 ++++++
src/plugin/index-links/ivy.xml | 1 -
src/plugin/index-metadata/ivy.xml | 1 -
src/plugin/index-more/ivy.xml | 1 -
src/plugin/index-replace/ivy.xml | 1 -
.../index-replace/sample/testIndexReplace.html | 16 ++++++
src/plugin/index-static/ivy.xml | 1 -
src/plugin/indexer-cloudsearch/README.md | 17 ++++++
src/plugin/indexer-cloudsearch/createCSDomain.sh | 15 +++++
src/plugin/indexer-csv/README.md | 17 ++++++
src/plugin/indexer-csv/ivy.xml | 1 -
src/plugin/indexer-dummy/README.md | 17 ++++++
src/plugin/indexer-dummy/ivy.xml | 1 -
src/plugin/indexer-elastic/README.md | 17 ++++++
.../{howto_upgrade_es.txt => howto_upgrade_es.md} | 17 ++++++
src/plugin/indexer-kafka/ivy.xml | 1 -
src/plugin/indexer-opensearch-1x/README.md | 17 ++++++
..._opensearch.txt => howto_upgrade_opensearch.md} | 17 ++++++
src/plugin/indexer-rabbit/README.md | 17 ++++++
src/plugin/indexer-rabbit/ivy.xml | 1 -
src/plugin/indexer-solr/README.md | 17 ++++++
...owto_upgrade_solr.txt => howto_upgrade_solr.md} | 17 ++++++
src/plugin/indexer-solr/ivy.xml | 25 +++++----
src/plugin/indexer-solr/plugin.xml | 26 +++++----
src/plugin/language-identifier/ivy.xml | 1 -
src/plugin/lib-htmlunit/ivy.xml | 1 -
src/plugin/lib-http/ivy.xml | 1 -
src/plugin/lib-nekohtml/ivy.xml | 1 -
src/plugin/lib-rabbitmq/ivy.xml | 1 -
src/plugin/lib-regex-filter/ivy.xml | 1 -
src/plugin/lib-selenium/README.md | 17 ++++++
.../howto_upgrade_selenium.md} | 42 +++++---------
src/plugin/lib-selenium/howto_upgrade_selenium.txt | 15 -----
src/plugin/lib-selenium/ivy.xml | 1 -
src/plugin/lib-xml/ivy.xml | 1 -
src/plugin/microformats-reltag/ivy.xml | 1 -
src/plugin/mimetype-filter/ivy.xml | 1 -
src/plugin/nutch-extensionpoints/ivy.xml | 1 -
src/plugin/parse-ext/command | 15 +++++
src/plugin/parse-ext/ivy.xml | 1 -
src/plugin/parse-html/ivy.xml | 1 -
src/plugin/parse-js/ivy.xml | 1 -
.../parse-js/sample/parse_embedded_js_test.html | 16 ++++++
src/plugin/parse-js/sample/parse_pure_js_test.js | 15 +++++
src/plugin/parse-metatags/ivy.xml | 1 -
src/plugin/parse-metatags/sample/testMetatags.html | 16 ++++++
.../sample/testMultivalueMetatags.html | 16 ++++++
...owto_upgrade_tika.txt => howto_upgrade_tika.md} | 17 ++++++
src/plugin/parse-tika/ivy.xml | 1 -
src/plugin/parse-tika/sample/nutch.html | 16 ++++++
src/plugin/parse-zip/ivy.xml | 1 -
src/plugin/parsefilter-debug/ivy.xml | 1 -
src/plugin/parsefilter-naivebayes/ivy.xml | 1 -
.../parsefilter-regex/data/regex-parsefilter.txt | 15 +++++
src/plugin/parsefilter-regex/ivy.xml | 1 -
src/plugin/protocol-file/ivy.xml | 1 -
.../protocol-file/sample/testprotocolfile.txt | 15 +++++
.../sample/testprotocolfile_(encoded).txt | 15 +++++
src/plugin/protocol-foo/ivy.xml | 1 -
src/plugin/protocol-foo/plugin.xml | 1 -
src/plugin/protocol-ftp/ivy.xml | 1 -
src/plugin/protocol-htmlunit/ivy.xml | 1 -
src/plugin/protocol-http/ivy.xml | 1 -
src/plugin/protocol-httpclient/ivy.xml | 1 -
src/plugin/protocol-interactiveselenium/README.md | 17 ++++++
src/plugin/protocol-interactiveselenium/ivy.xml | 1 -
..._upgrade_okhttp.txt => howto_upgrade_okhttp.md} | 17 ++++++
src/plugin/protocol-okhttp/ivy.xml | 1 -
src/plugin/protocol-selenium/README.md | 17 ++++++
src/plugin/protocol-selenium/ivy.xml | 1 -
src/plugin/publish-rabbitmq/ivy.xml | 1 -
src/plugin/scoring-depth/ivy.xml | 1 -
src/plugin/scoring-link/ivy.xml | 1 -
src/plugin/scoring-metadata/ivy.xml | 1 -
src/plugin/scoring-opic/ivy.xml | 1 -
src/plugin/scoring-orphan/ivy.xml | 1 -
src/plugin/scoring-similarity/ivy.xml | 1 -
src/plugin/subcollection/ivy.xml | 1 -
src/plugin/tld/ivy.xml | 1 -
src/plugin/urlfilter-automaton/ivy.xml | 1 -
src/plugin/urlfilter-domain/data/hosts.txt | 15 +++++
src/plugin/urlfilter-domain/ivy.xml | 1 -
src/plugin/urlfilter-domaindenylist/data/hosts.txt | 15 +++++
src/plugin/urlfilter-domaindenylist/ivy.xml | 1 -
src/plugin/urlfilter-fast/README.md | 16 ++++++
src/plugin/urlfilter-fast/ivy.xml | 1 -
src/plugin/urlfilter-ignoreexempt/README.md | 17 ++++++
src/plugin/urlfilter-ignoreexempt/ivy.xml | 1 -
src/plugin/urlfilter-prefix/ivy.xml | 1 -
src/plugin/urlfilter-regex/ivy.xml | 1 -
src/plugin/urlfilter-suffix/ivy.xml | 1 -
src/plugin/urlfilter-validator/ivy.xml | 1 -
src/plugin/urlmeta/ivy.xml | 1 -
src/plugin/urlnormalizer-ajax/ivy.xml | 1 -
src/plugin/urlnormalizer-basic/ivy.xml | 1 -
src/plugin/urlnormalizer-host/data/hosts.txt | 15 +++++
src/plugin/urlnormalizer-host/ivy.xml | 1 -
src/plugin/urlnormalizer-pass/ivy.xml | 1 -
.../urlnormalizer-protocol/data/protocols.txt | 15 +++++
src/plugin/urlnormalizer-protocol/ivy.xml | 1 -
src/plugin/urlnormalizer-querystring/ivy.xml | 1 -
src/plugin/urlnormalizer-regex/ivy.xml | 1 -
.../sample/regex-normalize-default.test | 15 +++++
.../sample/regex-normalize-scope1.test | 15 +++++
src/plugin/urlnormalizer-slash/data/slashes.txt | 15 +++++
src/plugin/urlnormalizer-slash/ivy.xml | 1 -
src/test/crawl-tests.xml | 16 ++++++
src/test/filter-all.txt | 15 +++++
src/test/log4j.properties | 15 +++++
src/test/nutch-site.xml | 16 ++++++
.../fetch-test-site/dup_of_pagea.html | 16 ++++++
src/testresources/fetch-test-site/exception.html | 16 ++++++
src/testresources/fetch-test-site/index.html | 16 ++++++
.../fetch-test-site/nested_spider_trap.html | 16 ++++++
src/testresources/fetch-test-site/pagea.html | 16 ++++++
src/testresources/fetch-test-site/pageb.html | 16 ++++++
src/testresources/fetch-test-site/robots.txt | 14 +++++
138 files changed, 1017 insertions(+), 177 deletions(-)
diff --git a/.github/workflows/master-build.yml b/.github/workflows/dependency-check.yml
similarity index 72%
copy from .github/workflows/master-build.yml
copy to .github/workflows/dependency-check.yml
index ba1d470ec..f07f746a0 100644
--- a/.github/workflows/master-build.yml
+++ b/.github/workflows/dependency-check.yml
@@ -1,4 +1,3 @@
-#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
@@ -13,28 +12,26 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#
name: master pr build
on:
- push:
- branches: [ master ]
- pull_request:
- branches: [ master ]
+ schedule:
+ - cron: '0 0 * * *' # every day at midnight
jobs:
- build:
- runs-on: ubuntu-latest
+ dependency-check:
strategy:
matrix:
- java: [ '11' ]
-
+ java: ['11']
+ os: [ubuntu-latest]
+ runs-on: ${{ matrix.os }}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up JDK ${{ matrix.java }}
- uses: actions/setup-java@v1
+ uses: actions/setup-java@v3
with:
java-version: ${{ matrix.java }}
- - name: Build with Ant
- run: ant clean nightly javadoc -buildfile build.xml
+ distribution: 'temurin'
+ - name: Dependency check
+ run: ant clean dependency-check -buildfile build.xml
diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml
index ba1d470ec..e0af58df0 100644
--- a/.github/workflows/master-build.yml
+++ b/.github/workflows/master-build.yml
@@ -1,4 +1,3 @@
-#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
@@ -13,28 +12,67 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#
-name: master pr build
+name: master pull request ci
on:
push:
- branches: [ master ]
+ branches: [master]
pull_request:
- branches: [ master ]
+ types: [opened, synchronize, reopened]
+ branches: [master]
jobs:
- build:
- runs-on: ubuntu-latest
+ javadoc:
strategy:
matrix:
- java: [ '11' ]
-
+ java: ['11']
+ os: [ubuntu-latest]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@v3
+ with:
+ java-version: ${{ matrix.java }}
+ distribution: 'temurin'
+ - name: Javadoc
+ run: ant clean javadoc -buildfile build.xml
+ rat:
+ strategy:
+ matrix:
+ java: ['11']
+ os: [ubuntu-latest]
+ runs-on: ${{ matrix.os }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@v3
+ with:
+ java-version: ${{ matrix.java }}
+ distribution: 'temurin'
+ - name: Run Apache Rat
+ run: ant clean run-rat -buildfile build.xml
+ - name: Cache unknown licenses
+ run: echo "UNKNOWN_LICENSES=$(sed -n 18p /home/runner/work/nutch/nutch/build/apache-rat-report.txt)" >> $GITHUB_ENV
+ - name: Versions
+ run: |
+ echo $UNKNOWN_LICENSES
+ - name: Fail if any unknown licenses
+ if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }}
+ run: exit 1
+ test:
+ strategy:
+ matrix:
+ java: ['11']
+ os: [ubuntu-latest, macos-latest]
+ runs-on: ${{ matrix.os }}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v4
- name: Set up JDK ${{ matrix.java }}
- uses: actions/setup-java@v1
+ uses: actions/setup-java@v3
with:
java-version: ${{ matrix.java }}
- - name: Build with Ant
- run: ant clean nightly javadoc -buildfile build.xml
+ distribution: 'temurin'
+ - name: Test
+ run: ant clean test -buildfile build.xml
diff --git a/.gitignore b/.gitignore
index b46690852..12365dd0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ csvindexwriter
lib/spotbugs-*
ivy/dependency-check-ant/*
.gradle*
+ivy/apache-rat-*
diff --git a/build.xml b/build.xml
index b44581405..dd9797302 100644
--- a/build.xml
+++ b/build.xml
@@ -38,7 +38,7 @@
<property name="maven-javadoc-jar" value="${release.dir}/${artifactId}-${version}-javadoc.jar" />
<property name="maven-sources-jar" value="${release.dir}/${artifactId}-${version}-sources.jar" />
- <property name="dependency-check-ant.version" value="7.1.1" />
+ <property name="dependency-check-ant.version" value="8.4.2" />
<property name="dependency-check-ant.home" value="${ivy.dir}/dependency-check-ant" />
<property name="dependency-check-ant.jar" value="${dependency-check-ant.home}/dependency-check-ant.jar" />
@@ -48,7 +48,7 @@
<property name="spotbugs.home" value="${ivy.dir}/spotbugs-${spotbugs.version}" />
<property name="spotbugs.jar" value="${spotbugs.home}/lib/spotbugs-ant.jar" />
- <property name="apache-rat.version" value="0.14" />
+ <property name="apache-rat.version" value="0.15" />
<property name="apache-rat.home" value="${ivy.dir}/apache-rat-${apache-rat.version}" />
<property name="apache-rat.jar" value="${apache-rat.home}/apache-rat-${apache-rat.version}.jar" />
@@ -640,13 +640,15 @@
</fileset>
</path>
- <target name="report-vulnerabilities" depends="jar, compile-plugins, dependency-check-ant-download" description="--> check dependencies for security vulnerabilities">
+ <target name="dependency-check" depends="jar, compile-plugins, dependency-check-ant-download" description="--> check dependencies for security vulnerabilities">
<taskdef resource="dependency-check-taskdefs.properties">
<classpath refid="dependency-check-ant.path" />
</taskdef>
<dependency-check projectname="${name}"
reportoutputdirectory="${dependency-check-ant.home}"
- reportformat="ALL">
+ reportformat="ALL"
+ assemblyAnalyzerEnabled="false"
+ failBuildOnCVSS="1">
<suppressionfile path="${dependency-check-ant.home}/dependency-check-suppressions.xml" />
<retirejsFilter regex="copyright.*jeremy long" />
<fileset dir="${build.dir}">
@@ -1025,7 +1027,7 @@
<target name="apache-rat-download-unchecked" unless="apache-rat.jar.found"
description="--> downloads the Apache Rat jar">
- <get src="https://www.apache.org/dist/creadur/apache-rat-${apache-rat.version}/apache-rat-${apache-rat.version}-bin.tar.gz"
+ <get src="https://archive.apache.org/dist/creadur/apache-rat-${apache-rat.version}/apache-rat-${apache-rat.version}-bin.tar.gz"
dest="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz"
usetimestamp="false" />
<untar src="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz"
@@ -1035,8 +1037,8 @@
<delete file="${ivy.dir}/apache-rat-${apache-rat.version}-bin.tar.gz" />
</target>
- <target name="rat-sources" depends="init, apache-rat-download"
- description="--> runs RAT tasks over src/java">
+ <target name="run-rat" depends="init, apache-rat-download"
+ description="--> runs Apache Rat on codebase">
<taskdef
uri="antlib:org.apache.rat.anttasks"
resource="org/apache/rat/anttasks/antlib.xml">
@@ -1047,8 +1049,40 @@
<rat:report
reportFile="${build.dir}/apache-rat-report.txt">
<fileset dir="src">
- <include name="java/**/*"/>
- <include name="plugin/**/src/**/*"/>
+ <include name="**"/>
+ <exclude name="plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/langmappings.properties"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/de.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/en.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/es.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/fi.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/fr.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/it.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/nl.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/pt.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/sv.test"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/test-referencial.txt"/>
+ <exclude name="plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/da.test"/>
+ <exclude name="plugin/parse-tika/sample/ootest.txt"/>
+ <exclude name="plugin/parse-tika/sample/test.rtf"/>
+ <exclude name="plugin/urlfilter-ignoreexempt/data/.donotdelete"/>
+ <exclude name="plugin/urlfilter-automaton/sample/Benchmarks.rules"/>
+ <exclude name="plugin/urlfilter-automaton/sample/Benchmarks.urls"/>
+ <exclude name="plugin/urlfilter-automaton/sample/IntranetCrawling.rules"/>
+ <exclude name="plugin/urlfilter-automaton/sample/IntranetCrawling.urls"/>
+ <exclude name="plugin/urlfilter-automaton/sample/WholeWebCrawling.rules"/>
+ <exclude name="plugin/urlfilter-automaton/sample/WholeWebCrawling.urls"/>
+ <exclude name="plugin/urlfilter-fast/sample/Benchmarks.urls"/>
+ <exclude name="plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt"/>
+ <exclude name="plugin/urlfilter-fast/sample/fast-urlfilter-test.txt"/>
+ <exclude name="plugin/urlfilter-fast/sample/test.urls"/>
+ <exclude name="plugin/urlfilter-regex/sample/Benchmarks.rules"/>
+ <exclude name="plugin/urlfilter-regex/sample/Benchmarks.urls"/>
+ <exclude name="plugin/urlfilter-regex/sample/IntranetCrawling.rules"/>
+ <exclude name="plugin/urlfilter-regex/sample/IntranetCrawling.urls"/>
+ <exclude name="plugin/urlfilter-regex/sample/WholeWebCrawling.rules"/>
+ <exclude name="plugin/urlfilter-regex/sample/WholeWebCrawling.urls"/>
+ <exclude name="plugin/urlfilter-regex/sample/nutch1838.rules"/>
+ <exclude name="plugin/urlfilter-regex/sample/nutch1838.urls"/>
</fileset>
</rat:report>
</target>
diff --git a/ivy/dependency-check-ant/dependency-check-suppressions.xml b/ivy/dependency-check-ant/dependency-check-suppressions.xml
index e7de8febb..a7f4ca16d 100644
--- a/ivy/dependency-check-ant/dependency-check-suppressions.xml
+++ b/ivy/dependency-check-ant/dependency-check-suppressions.xml
@@ -1,8 +1,3 @@
<?xml version="1.0" encoding="UTF-8"?>
<suppressions xmlns="https://jeremylong.github.io/DependencyCheck/dependency-suppression.1.1.xsd">
- <suppress>
- <notes>only applies to tika-server < 1.18</notes>
- <gav regex="true">^org\.(apache\.tika:tika-(core|parsers)|gagravarr:vorbis-java-tika):.*$</gav>
- <cve>CVE-2018-1335</cve>
- </suppress>
</suppressions>
diff --git a/src/java/overview.html b/src/java/overview.html
index 11321417b..3de53a7d2 100644
--- a/src/java/overview.html
+++ b/src/java/overview.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<title>Apache Nutch</title>
diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt
index 324617f07..eb6786e4b 100644
--- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt
+++ b/src/plugin/creativecommons/conf/crawl-urlfilter.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Creative Commnons crawl filter
# Each non-comment, non-blank line contains a regular expression
diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml
index e28e12a9a..4b343b2cc 100644
--- a/src/plugin/creativecommons/conf/nutch-site.xml
+++ b/src/plugin/creativecommons/conf/nutch-site.xml
@@ -1,5 +1,21 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!-- Creative Commons' Nutch configuration -->
diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html
index 90b522759..3267bc9ea 100755
--- a/src/plugin/creativecommons/data/anchor.html
+++ b/src/plugin/creativecommons/data/anchor.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
<html>
<head>
diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html
index fb2c34dfe..60c27cc54 100755
--- a/src/plugin/creativecommons/data/rdf.html
+++ b/src/plugin/creativecommons/data/rdf.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html
index 413d52f86..3d11572d8 100755
--- a/src/plugin/creativecommons/data/rel.html
+++ b/src/plugin/creativecommons/data/rel.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head>
</head><body>
diff --git a/src/plugin/creativecommons/ivy.xml b/src/plugin/creativecommons/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/creativecommons/ivy.xml
+++ b/src/plugin/creativecommons/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/exchange-jexl/README.md b/src/plugin/exchange-jexl/README.md
index 2d2024276..35a711b90 100644
--- a/src/plugin/exchange-jexl/README.md
+++ b/src/plugin/exchange-jexl/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
exchange-jexl plugin for Nutch
==============================
diff --git a/src/plugin/exchange-jexl/ivy.xml b/src/plugin/exchange-jexl/ivy.xml
index 1275664e5..cb5a0f186 100644
--- a/src/plugin/exchange-jexl/ivy.xml
+++ b/src/plugin/exchange-jexl/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/feed/ivy.xml b/src/plugin/feed/ivy.xml
index 7e3f4ede3..a7671307b 100644
--- a/src/plugin/feed/ivy.xml
+++ b/src/plugin/feed/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/headings/ivy.xml b/src/plugin/headings/ivy.xml
index a8d6b9d48..63007f93c 100644
--- a/src/plugin/headings/ivy.xml
+++ b/src/plugin/headings/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-anchor/ivy.xml b/src/plugin/index-anchor/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/index-anchor/ivy.xml
+++ b/src/plugin/index-anchor/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-basic/ivy.xml b/src/plugin/index-basic/ivy.xml
index 673ea7f09..7bae19bb9 100644
--- a/src/plugin/index-basic/ivy.xml
+++ b/src/plugin/index-basic/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml
index 2eda5a63f..45a638819 100644
--- a/src/plugin/index-geoip/ivy.xml
+++ b/src/plugin/index-geoip/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml
index c4efadf94..dda1b6a7b 100644
--- a/src/plugin/index-geoip/plugin.xml
+++ b/src/plugin/index-geoip/plugin.xml
@@ -1,3 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-jexl-filter/ivy.xml b/src/plugin/index-jexl-filter/ivy.xml
index 624dcaf4a..3d4fc905c 100644
--- a/src/plugin/index-jexl-filter/ivy.xml
+++ b/src/plugin/index-jexl-filter/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-links/README.md b/src/plugin/index-links/README.md
index f25d1cf6d..ac0f071f4 100644
--- a/src/plugin/index-links/README.md
+++ b/src/plugin/index-links/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
indexer-links plugin for Nutch
==============================
diff --git a/src/plugin/index-links/ivy.xml b/src/plugin/index-links/ivy.xml
index 624dcaf4a..3d4fc905c 100644
--- a/src/plugin/index-links/ivy.xml
+++ b/src/plugin/index-links/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-metadata/ivy.xml b/src/plugin/index-metadata/ivy.xml
index 1275664e5..cb5a0f186 100644
--- a/src/plugin/index-metadata/ivy.xml
+++ b/src/plugin/index-metadata/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-more/ivy.xml b/src/plugin/index-more/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/index-more/ivy.xml
+++ b/src/plugin/index-more/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-replace/ivy.xml b/src/plugin/index-replace/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/index-replace/ivy.xml
+++ b/src/plugin/index-replace/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/index-replace/sample/testIndexReplace.html b/src/plugin/index-replace/sample/testIndexReplace.html
index 0b90fc211..fb2ef03a5 100644
--- a/src/plugin/index-replace/sample/testIndexReplace.html
+++ b/src/plugin/index-replace/sample/testIndexReplace.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<title>Testing the power of the index-replace plugin</title>
diff --git a/src/plugin/index-static/ivy.xml b/src/plugin/index-static/ivy.xml
index 1275664e5..cb5a0f186 100644
--- a/src/plugin/index-static/ivy.xml
+++ b/src/plugin/index-static/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/indexer-cloudsearch/README.md b/src/plugin/indexer-cloudsearch/README.md
index 10b5daa90..a0609c0fb 100644
--- a/src/plugin/indexer-cloudsearch/README.md
+++ b/src/plugin/indexer-cloudsearch/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
AWS CloudSearch plugin for Nutch
================================
diff --git a/src/plugin/indexer-cloudsearch/createCSDomain.sh b/src/plugin/indexer-cloudsearch/createCSDomain.sh
index 24fb0156c..1cb8481fe 100644
--- a/src/plugin/indexer-cloudsearch/createCSDomain.sh
+++ b/src/plugin/indexer-cloudsearch/createCSDomain.sh
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# example of domain configuration for CloudSearch
DOMAIN="$1"
diff --git a/src/plugin/indexer-csv/README.md b/src/plugin/indexer-csv/README.md
index 80220974a..4d1288b19 100644
--- a/src/plugin/indexer-csv/README.md
+++ b/src/plugin/indexer-csv/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
indexer-csv plugin for Nutch
============================
diff --git a/src/plugin/indexer-csv/ivy.xml b/src/plugin/indexer-csv/ivy.xml
index 75b5d54e5..e7bf87546 100644
--- a/src/plugin/indexer-csv/ivy.xml
+++ b/src/plugin/indexer-csv/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/indexer-dummy/README.md b/src/plugin/indexer-dummy/README.md
index 2a4b2bd15..a7fa53009 100644
--- a/src/plugin/indexer-dummy/README.md
+++ b/src/plugin/indexer-dummy/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
indexer-dummy plugin for Nutch
==============================
diff --git a/src/plugin/indexer-dummy/ivy.xml b/src/plugin/indexer-dummy/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/indexer-dummy/ivy.xml
+++ b/src/plugin/indexer-dummy/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/indexer-elastic/README.md b/src/plugin/indexer-elastic/README.md
index 466762e1c..3dfd888ff 100644
--- a/src/plugin/indexer-elastic/README.md
+++ b/src/plugin/indexer-elastic/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
indexer-elastic plugin for Nutch
================================
diff --git a/src/plugin/indexer-elastic/howto_upgrade_es.txt b/src/plugin/indexer-elastic/howto_upgrade_es.md
similarity index 60%
rename from src/plugin/indexer-elastic/howto_upgrade_es.txt
rename to src/plugin/indexer-elastic/howto_upgrade_es.md
index a8156444c..b57e0c02f 100644
--- a/src/plugin/indexer-elastic/howto_upgrade_es.txt
+++ b/src/plugin/indexer-elastic/howto_upgrade_es.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
1. Upgrade Elasticsearch dependency in src/plugin/indexer-elastic/ivy.xml
2. Upgrade the Elasticsearch specific dependencies in src/plugin/indexer-elastic/plugin.xml
diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml
index 7bdd94324..9d605c50b 100644
--- a/src/plugin/indexer-kafka/ivy.xml
+++ b/src/plugin/indexer-kafka/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/indexer-opensearch-1x/README.md b/src/plugin/indexer-opensearch-1x/README.md
index 52e5844af..e5e76f0b6 100644
--- a/src/plugin/indexer-opensearch-1x/README.md
+++ b/src/plugin/indexer-opensearch-1x/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
indexer-opensearch1x plugin for Nutch
================================
diff --git a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md
similarity index 60%
rename from src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt
rename to src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md
index 072590044..c9b723ffc 100644
--- a/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.txt
+++ b/src/plugin/indexer-opensearch-1x/howto_upgrade_opensearch.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
1. Upgrade OpenSearch dependency in src/plugin/indexer-opensearch-1x/ivy.xml
2. Upgrade the OpenSearch specific dependencies in src/plugin/indexer-opensearch-1x/plugin.xml
diff --git a/src/plugin/indexer-rabbit/README.md b/src/plugin/indexer-rabbit/README.md
index 6ea09a915..8040cd6c7 100644
--- a/src/plugin/indexer-rabbit/README.md
+++ b/src/plugin/indexer-rabbit/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
indexer-rabbit plugin for Nutch
===============================
diff --git a/src/plugin/indexer-rabbit/ivy.xml b/src/plugin/indexer-rabbit/ivy.xml
index dd450cf7f..d2daf91da 100644
--- a/src/plugin/indexer-rabbit/ivy.xml
+++ b/src/plugin/indexer-rabbit/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/indexer-solr/README.md b/src/plugin/indexer-solr/README.md
index c3a4601e1..3a27e4116 100644
--- a/src/plugin/indexer-solr/README.md
+++ b/src/plugin/indexer-solr/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
indexer-solr plugin for Nutch
=============================
diff --git a/src/plugin/indexer-solr/howto_upgrade_solr.txt b/src/plugin/indexer-solr/howto_upgrade_solr.md
similarity index 58%
rename from src/plugin/indexer-solr/howto_upgrade_solr.txt
rename to src/plugin/indexer-solr/howto_upgrade_solr.md
index b2a7eb5c8..905fb84a9 100644
--- a/src/plugin/indexer-solr/howto_upgrade_solr.txt
+++ b/src/plugin/indexer-solr/howto_upgrade_solr.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
1. Upgrade Solr dependency in src/plugin/indexer-solr/ivy.xml
2. Upgrade the Solr specific dependencies in src/plugin/indexer-solr/plugin.xml
diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml
index ce59942da..ab5fd72c7 100644
--- a/src/plugin/indexer-solr/ivy.xml
+++ b/src/plugin/indexer-solr/ivy.xml
@@ -1,15 +1,20 @@
<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- You under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<ivy-module version="1.0">
<info organisation="org.apache.nutch" module="${ant.project.name}">
diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml
index f672ac9ed..21cc7d8bd 100644
--- a/src/plugin/indexer-solr/plugin.xml
+++ b/src/plugin/indexer-solr/plugin.xml
@@ -1,14 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- You under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<plugin id="indexer-solr" name="SolrIndexWriter" version="1.0.0"
provider-name="nutch.apache.org">
diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml
index 68e9ed76e..f64b97055 100644
--- a/src/plugin/language-identifier/ivy.xml
+++ b/src/plugin/language-identifier/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml
index b03211667..795e6b335 100644
--- a/src/plugin/lib-htmlunit/ivy.xml
+++ b/src/plugin/lib-htmlunit/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/lib-http/ivy.xml b/src/plugin/lib-http/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/lib-http/ivy.xml
+++ b/src/plugin/lib-http/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/lib-nekohtml/ivy.xml b/src/plugin/lib-nekohtml/ivy.xml
index 072fb05b9..32fcd8c4b 100644
--- a/src/plugin/lib-nekohtml/ivy.xml
+++ b/src/plugin/lib-nekohtml/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/lib-rabbitmq/ivy.xml b/src/plugin/lib-rabbitmq/ivy.xml
index 1b6ceac37..8184530af 100644
--- a/src/plugin/lib-rabbitmq/ivy.xml
+++ b/src/plugin/lib-rabbitmq/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/lib-regex-filter/ivy.xml b/src/plugin/lib-regex-filter/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/lib-regex-filter/ivy.xml
+++ b/src/plugin/lib-regex-filter/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/lib-selenium/README.md b/src/plugin/lib-selenium/README.md
index 1c6b37c5f..5054d7ad8 100644
--- a/src/plugin/lib-selenium/README.md
+++ b/src/plugin/lib-selenium/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
# Updates
* The use of phantomjs has been deprecated. Check [Wikipedia](https://en.wikipedia.org/wiki/PhantomJS) for more info.
* The updated code for Safari webriver is under development as starting Safari 10 on OS X El Capitan and macOS Sierra, Safari comes bundled with a new driver implementation.
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/lib-selenium/howto_upgrade_selenium.md
similarity index 52%
copy from src/plugin/index-geoip/plugin.xml
copy to src/plugin/lib-selenium/howto_upgrade_selenium.md
index c4efadf94..3071c74cb 100644
--- a/src/plugin/index-geoip/plugin.xml
+++ b/src/plugin/lib-selenium/howto_upgrade_selenium.md
@@ -14,31 +14,19 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
-<plugin
- id="index-geoip"
- name="GeoIP2 Indexing Filter"
- version="1.0.0"
- provider-name="nutch.org">
-
-
- <runtime>
- <library name="index-geoip.jar">
- <export name="*"/>
- </library>
- <library name="geoip2-3.0.1.jar"/>
- <library name="maxmind-db-2.0.0.jar"/>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.apache.nutch.indexer.geoip"
- name="Nutch GeoIP2 Indexing Filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="GeoIPIndexingFilter"
- class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/>
- </extension>
-
-</plugin>
+1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml
+
+2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
+
+ To get a list of dependencies and their versions execute:
+ $ ant -f ./build-ivy.xml
+ $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g'
+
+ Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
+
+ N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows
+
+ $ brew install gnu-sed --with-default-names
+
+ You can then restart your terminal and the Regex + Sed command should work just fine!
diff --git a/src/plugin/lib-selenium/howto_upgrade_selenium.txt b/src/plugin/lib-selenium/howto_upgrade_selenium.txt
deleted file mode 100644
index 1892a6275..000000000
--- a/src/plugin/lib-selenium/howto_upgrade_selenium.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-1. Upgrade various driver versions dependency in src/plugin/lib-selenium/ivy.xml
-
-2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
-
- To get a list of dependencies and their versions execute:
- $ ant -f ./build-ivy.xml
- $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/">\n <export name="*"\/>\n <\/library>/g'
-
- Note that all dependent libraries are exported for a "library" plugin ("lib-selenium").
-
- N.B. The above Regex + Sed commands may not work if you are using MacOSX's Sed. In this instance you can instal GNU Sed as follows
-
- $ brew install gnu-sed --with-default-names
-
- You can then restart your terminal and the Regex + Sed command should work just fine!
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml
index 7d3a2d624..0d460cdb4 100644
--- a/src/plugin/lib-selenium/ivy.xml
+++ b/src/plugin/lib-selenium/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/lib-xml/ivy.xml b/src/plugin/lib-xml/ivy.xml
index 9306c4d9b..4e38c4371 100644
--- a/src/plugin/lib-xml/ivy.xml
+++ b/src/plugin/lib-xml/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/microformats-reltag/ivy.xml b/src/plugin/microformats-reltag/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/microformats-reltag/ivy.xml
+++ b/src/plugin/microformats-reltag/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/mimetype-filter/ivy.xml b/src/plugin/mimetype-filter/ivy.xml
index 624dcaf4a..3d4fc905c 100644
--- a/src/plugin/mimetype-filter/ivy.xml
+++ b/src/plugin/mimetype-filter/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/nutch-extensionpoints/ivy.xml b/src/plugin/nutch-extensionpoints/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/nutch-extensionpoints/ivy.xml
+++ b/src/plugin/nutch-extensionpoints/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parse-ext/command b/src/plugin/parse-ext/command
index f42c05531..329d58d96 100644
--- a/src/plugin/parse-ext/command
+++ b/src/plugin/parse-ext/command
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
#!/bin/bash
#
# Sample bash script as external command invoked by parse-ext plugin
diff --git a/src/plugin/parse-ext/ivy.xml b/src/plugin/parse-ext/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/parse-ext/ivy.xml
+++ b/src/plugin/parse-ext/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parse-html/ivy.xml b/src/plugin/parse-html/ivy.xml
index 69aa2eba5..1424c4d7a 100644
--- a/src/plugin/parse-html/ivy.xml
+++ b/src/plugin/parse-html/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parse-js/ivy.xml b/src/plugin/parse-js/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/parse-js/ivy.xml
+++ b/src/plugin/parse-js/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parse-js/sample/parse_embedded_js_test.html b/src/plugin/parse-js/sample/parse_embedded_js_test.html
index 351beacc3..0409bba53 100644
--- a/src/plugin/parse-js/sample/parse_embedded_js_test.html
+++ b/src/plugin/parse-js/sample/parse_embedded_js_test.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html style="font-size: 16px;"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
diff --git a/src/plugin/parse-js/sample/parse_pure_js_test.js b/src/plugin/parse-js/sample/parse_pure_js_test.js
index f196313f8..0e486a879 100644
--- a/src/plugin/parse-js/sample/parse_pure_js_test.js
+++ b/src/plugin/parse-js/sample/parse_pure_js_test.js
@@ -1,3 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
// test data for link extraction from "pure" JavaScript
function selectProvider(form) {
diff --git a/src/plugin/parse-metatags/ivy.xml
b/src/plugin/parse-metatags/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/parse-metatags/ivy.xml
+++ b/src/plugin/parse-metatags/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parse-metatags/sample/testMetatags.html
b/src/plugin/parse-metatags/sample/testMetatags.html
index e9e8e6bd0..4dc86c194 100644
--- a/src/plugin/parse-metatags/sample/testMetatags.html
+++ b/src/plugin/parse-metatags/sample/testMetatags.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<meta name="Keywords" content="This is a test of keywords" />
diff --git a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
index ca8b737c2..36d2c8814 100644
--- a/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
+++ b/src/plugin/parse-metatags/sample/testMultivalueMetatags.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<meta name="DC.creator" content="Doug Cutting">
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt
b/src/plugin/parse-tika/howto_upgrade_tika.md
similarity index 73%
rename from src/plugin/parse-tika/howto_upgrade_tika.txt
rename to src/plugin/parse-tika/howto_upgrade_tika.md
index 46d075948..8ed6c3f3c 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
We are currently using a shim (https://github.com/tballison/hadoop-safe-tika)
because of binary conflicts in commons-io versions between what Hadoop
supports and the more
modern features that Apache Tika and Apache POI were using in commons-io.
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index 1586d9661..b89e812e1 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parse-tika/sample/nutch.html
b/src/plugin/parse-tika/sample/nutch.html
index 0aa7c9895..809853512 100644
--- a/src/plugin/parse-tika/sample/nutch.html
+++ b/src/plugin/parse-tika/sample/nutch.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
diff --git a/src/plugin/parse-zip/ivy.xml b/src/plugin/parse-zip/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/parse-zip/ivy.xml
+++ b/src/plugin/parse-zip/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parsefilter-debug/ivy.xml
b/src/plugin/parsefilter-debug/ivy.xml
index dac80e6d7..82f93c012 100644
--- a/src/plugin/parsefilter-debug/ivy.xml
+++ b/src/plugin/parsefilter-debug/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parsefilter-naivebayes/ivy.xml
b/src/plugin/parsefilter-naivebayes/ivy.xml
index c261adac6..66a931543 100644
--- a/src/plugin/parsefilter-naivebayes/ivy.xml
+++ b/src/plugin/parsefilter-naivebayes/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
b/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
index 9d15cd899..fbc7dd303 100644
--- a/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
+++ b/src/plugin/parsefilter-regex/data/regex-parsefilter.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Example configuration file for parsefilter-regex
#
# Parse metadata field <name> is set to true if the HTML matches the regex. The
diff --git a/src/plugin/parsefilter-regex/ivy.xml
b/src/plugin/parsefilter-regex/ivy.xml
index e82f92861..f33a31178 100644
--- a/src/plugin/parsefilter-regex/ivy.xml
+++ b/src/plugin/parsefilter-regex/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-file/ivy.xml b/src/plugin/protocol-file/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/protocol-file/ivy.xml
+++ b/src/plugin/protocol-file/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-file/sample/testprotocolfile.txt
b/src/plugin/protocol-file/sample/testprotocolfile.txt
index fbe8a8acf..5e684e2f4 100644
--- a/src/plugin/protocol-file/sample/testprotocolfile.txt
+++ b/src/plugin/protocol-file/sample/testprotocolfile.txt
@@ -1 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
Protocol File Test
diff --git a/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
b/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
index fbe8a8acf..5e684e2f4 100644
--- a/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
+++ b/src/plugin/protocol-file/sample/testprotocolfile_(encoded).txt
@@ -1 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
Protocol File Test
diff --git a/src/plugin/protocol-foo/ivy.xml b/src/plugin/protocol-foo/ivy.xml
index 1a86d6803..99b173446 100755
--- a/src/plugin/protocol-foo/ivy.xml
+++ b/src/plugin/protocol-foo/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-foo/plugin.xml
b/src/plugin/protocol-foo/plugin.xml
index d34f6242a..954a2d41a 100755
--- a/src/plugin/protocol-foo/plugin.xml
+++ b/src/plugin/protocol-foo/plugin.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-ftp/ivy.xml b/src/plugin/protocol-ftp/ivy.xml
index 8e1c257d6..1fbfe97f0 100644
--- a/src/plugin/protocol-ftp/ivy.xml
+++ b/src/plugin/protocol-ftp/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-htmlunit/ivy.xml
b/src/plugin/protocol-htmlunit/ivy.xml
index dde1fe88f..fa787376b 100644
--- a/src/plugin/protocol-htmlunit/ivy.xml
+++ b/src/plugin/protocol-htmlunit/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-http/ivy.xml b/src/plugin/protocol-http/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/protocol-http/ivy.xml
+++ b/src/plugin/protocol-http/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-httpclient/ivy.xml
b/src/plugin/protocol-httpclient/ivy.xml
index 378bd7c42..e3e515dd9 100644
--- a/src/plugin/protocol-httpclient/ivy.xml
+++ b/src/plugin/protocol-httpclient/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-interactiveselenium/README.md
b/src/plugin/protocol-interactiveselenium/README.md
index dd43ee794..545efb830 100644
--- a/src/plugin/protocol-interactiveselenium/README.md
+++ b/src/plugin/protocol-interactiveselenium/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
Nutch Interactive Selenium
==========================
diff --git a/src/plugin/protocol-interactiveselenium/ivy.xml
b/src/plugin/protocol-interactiveselenium/ivy.xml
index 506be0aec..112483bcd 100644
--- a/src/plugin/protocol-interactiveselenium/ivy.xml
+++ b/src/plugin/protocol-interactiveselenium/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-okhttp/howto_upgrade_okhttp.txt
b/src/plugin/protocol-okhttp/howto_upgrade_okhttp.md
similarity index 52%
rename from src/plugin/protocol-okhttp/howto_upgrade_okhttp.txt
rename to src/plugin/protocol-okhttp/howto_upgrade_okhttp.md
index b3b6f1f22..16ae70d71 100644
--- a/src/plugin/protocol-okhttp/howto_upgrade_okhttp.txt
+++ b/src/plugin/protocol-okhttp/howto_upgrade_okhttp.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
1. Upgrade OkHttp dependency in src/plugin/protocol-okhttp/ivy.xml
2. Upgrade OkHttp's own dependencies in src/plugin/protocol-okhttp/plugin.xml
diff --git a/src/plugin/protocol-okhttp/ivy.xml
b/src/plugin/protocol-okhttp/ivy.xml
index ead823247..73b4fa636 100644
--- a/src/plugin/protocol-okhttp/ivy.xml
+++ b/src/plugin/protocol-okhttp/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/protocol-selenium/README.md
b/src/plugin/protocol-selenium/README.md
index 05132b9ef..4d43c330d 100644
--- a/src/plugin/protocol-selenium/README.md
+++ b/src/plugin/protocol-selenium/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
Nutch Selenium
==============
diff --git a/src/plugin/protocol-selenium/ivy.xml
b/src/plugin/protocol-selenium/ivy.xml
index 506be0aec..112483bcd 100644
--- a/src/plugin/protocol-selenium/ivy.xml
+++ b/src/plugin/protocol-selenium/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/publish-rabbitmq/ivy.xml
b/src/plugin/publish-rabbitmq/ivy.xml
index 7b5e3dd3c..008cdb1ca 100644
--- a/src/plugin/publish-rabbitmq/ivy.xml
+++ b/src/plugin/publish-rabbitmq/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/scoring-depth/ivy.xml b/src/plugin/scoring-depth/ivy.xml
index 1275664e5..cb5a0f186 100644
--- a/src/plugin/scoring-depth/ivy.xml
+++ b/src/plugin/scoring-depth/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/scoring-link/ivy.xml b/src/plugin/scoring-link/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/scoring-link/ivy.xml
+++ b/src/plugin/scoring-link/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/scoring-metadata/ivy.xml
b/src/plugin/scoring-metadata/ivy.xml
index 24d76063d..6fa1a2c06 100644
--- a/src/plugin/scoring-metadata/ivy.xml
+++ b/src/plugin/scoring-metadata/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/scoring-opic/ivy.xml b/src/plugin/scoring-opic/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/scoring-opic/ivy.xml
+++ b/src/plugin/scoring-opic/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/scoring-orphan/ivy.xml
b/src/plugin/scoring-orphan/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/scoring-orphan/ivy.xml
+++ b/src/plugin/scoring-orphan/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/scoring-similarity/ivy.xml
b/src/plugin/scoring-similarity/ivy.xml
index 1acd1d442..1a1945f57 100644
--- a/src/plugin/scoring-similarity/ivy.xml
+++ b/src/plugin/scoring-similarity/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/subcollection/ivy.xml b/src/plugin/subcollection/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/subcollection/ivy.xml
+++ b/src/plugin/subcollection/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/tld/ivy.xml b/src/plugin/tld/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/tld/ivy.xml
+++ b/src/plugin/tld/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-automaton/ivy.xml
b/src/plugin/urlfilter-automaton/ivy.xml
index 6b07ba33b..e9b1e892f 100644
--- a/src/plugin/urlfilter-automaton/ivy.xml
+++ b/src/plugin/urlfilter-automaton/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-domain/data/hosts.txt
b/src/plugin/urlfilter-domain/data/hosts.txt
index 2b88c3b05..8cf43745f 100644
--- a/src/plugin/urlfilter-domain/data/hosts.txt
+++ b/src/plugin/urlfilter-domain/data/hosts.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# comments start with the pound sign
net
apache.org
diff --git a/src/plugin/urlfilter-domain/ivy.xml
b/src/plugin/urlfilter-domain/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlfilter-domain/ivy.xml
+++ b/src/plugin/urlfilter-domain/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-domaindenylist/data/hosts.txt
b/src/plugin/urlfilter-domaindenylist/data/hosts.txt
index 2b88c3b05..8cf43745f 100644
--- a/src/plugin/urlfilter-domaindenylist/data/hosts.txt
+++ b/src/plugin/urlfilter-domaindenylist/data/hosts.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# comments start with the pound sign
net
apache.org
diff --git a/src/plugin/urlfilter-domaindenylist/ivy.xml
b/src/plugin/urlfilter-domaindenylist/ivy.xml
index 1275664e5..cb5a0f186 100644
--- a/src/plugin/urlfilter-domaindenylist/ivy.xml
+++ b/src/plugin/urlfilter-domaindenylist/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-fast/README.md
b/src/plugin/urlfilter-fast/README.md
index 46b293fe8..2e5860575 100644
--- a/src/plugin/urlfilter-fast/README.md
+++ b/src/plugin/urlfilter-fast/README.md
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
Filters URLs based on a file of regular expressions using host/domains
matching first. The default policy is to accept a URL if no matches
diff --git a/src/plugin/urlfilter-fast/ivy.xml
b/src/plugin/urlfilter-fast/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlfilter-fast/ivy.xml
+++ b/src/plugin/urlfilter-fast/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-ignoreexempt/README.md
b/src/plugin/urlfilter-ignoreexempt/README.md
index d48b6729f..a8f932e75 100644
--- a/src/plugin/urlfilter-ignoreexempt/README.md
+++ b/src/plugin/urlfilter-ignoreexempt/README.md
@@ -1,3 +1,20 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
urlfilter-ignoreexempt
======================
This plugin allows certain urls to be exempted when the external links are
configured to be ignored.
diff --git a/src/plugin/urlfilter-ignoreexempt/ivy.xml
b/src/plugin/urlfilter-ignoreexempt/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlfilter-ignoreexempt/ivy.xml
+++ b/src/plugin/urlfilter-ignoreexempt/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-prefix/ivy.xml
b/src/plugin/urlfilter-prefix/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlfilter-prefix/ivy.xml
+++ b/src/plugin/urlfilter-prefix/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-regex/ivy.xml
b/src/plugin/urlfilter-regex/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlfilter-regex/ivy.xml
+++ b/src/plugin/urlfilter-regex/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-suffix/ivy.xml
b/src/plugin/urlfilter-suffix/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlfilter-suffix/ivy.xml
+++ b/src/plugin/urlfilter-suffix/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlfilter-validator/ivy.xml
b/src/plugin/urlfilter-validator/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlfilter-validator/ivy.xml
+++ b/src/plugin/urlfilter-validator/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlmeta/ivy.xml b/src/plugin/urlmeta/ivy.xml
index 1275664e5..cb5a0f186 100644
--- a/src/plugin/urlmeta/ivy.xml
+++ b/src/plugin/urlmeta/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlnormalizer-ajax/ivy.xml
b/src/plugin/urlnormalizer-ajax/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlnormalizer-ajax/ivy.xml
+++ b/src/plugin/urlnormalizer-ajax/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlnormalizer-basic/ivy.xml
b/src/plugin/urlnormalizer-basic/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlnormalizer-basic/ivy.xml
+++ b/src/plugin/urlnormalizer-basic/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlnormalizer-host/data/hosts.txt
b/src/plugin/urlnormalizer-host/data/hosts.txt
index c7e0ccfe6..b81edae14 100644
--- a/src/plugin/urlnormalizer-host/data/hosts.txt
+++ b/src/plugin/urlnormalizer-host/data/hosts.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Force all sub domains to www.
*.example.com example.com
diff --git a/src/plugin/urlnormalizer-host/ivy.xml
b/src/plugin/urlnormalizer-host/ivy.xml
index 624dcaf4a..3d4fc905c 100644
--- a/src/plugin/urlnormalizer-host/ivy.xml
+++ b/src/plugin/urlnormalizer-host/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlnormalizer-pass/ivy.xml
b/src/plugin/urlnormalizer-pass/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlnormalizer-pass/ivy.xml
+++ b/src/plugin/urlnormalizer-pass/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlnormalizer-protocol/data/protocols.txt
b/src/plugin/urlnormalizer-protocol/data/protocols.txt
index fc7d86cbd..159917252 100644
--- a/src/plugin/urlnormalizer-protocol/data/protocols.txt
+++ b/src/plugin/urlnormalizer-protocol/data/protocols.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Example configuration file for urlnormalizer-protocol
#
# URL's of hosts listed in the configuration are normalized to the target
diff --git a/src/plugin/urlnormalizer-protocol/ivy.xml
b/src/plugin/urlnormalizer-protocol/ivy.xml
index 624dcaf4a..3d4fc905c 100644
--- a/src/plugin/urlnormalizer-protocol/ivy.xml
+++ b/src/plugin/urlnormalizer-protocol/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlnormalizer-querystring/ivy.xml
b/src/plugin/urlnormalizer-querystring/ivy.xml
index 624dcaf4a..3d4fc905c 100644
--- a/src/plugin/urlnormalizer-querystring/ivy.xml
+++ b/src/plugin/urlnormalizer-querystring/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlnormalizer-regex/ivy.xml
b/src/plugin/urlnormalizer-regex/ivy.xml
index 956fd25ef..5c2c5b77e 100644
--- a/src/plugin/urlnormalizer-regex/ivy.xml
+++ b/src/plugin/urlnormalizer-regex/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
b/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
index 7867ad80e..8560961c0 100644
--- a/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
+++ b/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# test simple removal of session id, keeping parameters before and after
http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03
http://foo.com/foo.php
http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03
http://foo.com/foo.php?f=2
diff --git a/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test
b/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test
index 9d928802e..9905e683d 100644
--- a/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test
+++ b/src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# test removal of subdomains
http://www.foo.bar.com/ http://bar.com/
diff --git a/src/plugin/urlnormalizer-slash/data/slashes.txt
b/src/plugin/urlnormalizer-slash/data/slashes.txt
index d3bd70a66..efcdafb63 100644
--- a/src/plugin/urlnormalizer-slash/data/slashes.txt
+++ b/src/plugin/urlnormalizer-slash/data/slashes.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Both domains have duplicate URL's, some with slashes and some without
# We prefer this domain with slashes
diff --git a/src/plugin/urlnormalizer-slash/ivy.xml
b/src/plugin/urlnormalizer-slash/ivy.xml
index 624dcaf4a..3d4fc905c 100644
--- a/src/plugin/urlnormalizer-slash/ivy.xml
+++ b/src/plugin/urlnormalizer-slash/ivy.xml
@@ -1,5 +1,4 @@
<?xml version="1.0" ?>
-
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
diff --git a/src/test/crawl-tests.xml b/src/test/crawl-tests.xml
index 01fc68301..b1e38ad3a 100644
--- a/src/test/crawl-tests.xml
+++ b/src/test/crawl-tests.xml
@@ -1,4 +1,20 @@
<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!-- Configuration overrides used during unit tests. -->
diff --git a/src/test/filter-all.txt b/src/test/filter-all.txt
index 4ed567ab1..d738aec76 100644
--- a/src/test/filter-all.txt
+++ b/src/test/filter-all.txt
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# Config file for urlfilter-suffix plugin
# Filter away all urls
diff --git a/src/test/log4j.properties b/src/test/log4j.properties
index 3ff115f46..08e272c71 100644
--- a/src/test/log4j.properties
+++ b/src/test/log4j.properties
@@ -1,3 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
# log4j configuration used during build and unit tests
log4j.rootLogger=info,stdout
diff --git a/src/test/nutch-site.xml b/src/test/nutch-site.xml
index dd408739d..0d6177e5e 100644
--- a/src/test/nutch-site.xml
+++ b/src/test/nutch-site.xml
@@ -1,4 +1,20 @@
<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!-- Configuration overrides used during unit tests. -->
diff --git a/src/testresources/fetch-test-site/dup_of_pagea.html
b/src/testresources/fetch-test-site/dup_of_pagea.html
index 6444c4122..63c4e6153 100644
--- a/src/testresources/fetch-test-site/dup_of_pagea.html
+++ b/src/testresources/fetch-test-site/dup_of_pagea.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<title>page a</title>
diff --git a/src/testresources/fetch-test-site/exception.html
b/src/testresources/fetch-test-site/exception.html
index e1192a176..66f134ee2 100644
--- a/src/testresources/fetch-test-site/exception.html
+++ b/src/testresources/fetch-test-site/exception.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
diff --git a/src/testresources/fetch-test-site/index.html
b/src/testresources/fetch-test-site/index.html
index d73ff3f69..3fc6e61e5 100644
--- a/src/testresources/fetch-test-site/index.html
+++ b/src/testresources/fetch-test-site/index.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<title>front page</title>
diff --git a/src/testresources/fetch-test-site/nested_spider_trap.html
b/src/testresources/fetch-test-site/nested_spider_trap.html
index 5dcf7c220..dd32ee236 100644
--- a/src/testresources/fetch-test-site/nested_spider_trap.html
+++ b/src/testresources/fetch-test-site/nested_spider_trap.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<title>nested spider trap</title>
diff --git a/src/testresources/fetch-test-site/pagea.html
b/src/testresources/fetch-test-site/pagea.html
index 6444c4122..63c4e6153 100644
--- a/src/testresources/fetch-test-site/pagea.html
+++ b/src/testresources/fetch-test-site/pagea.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<title>page a</title>
diff --git a/src/testresources/fetch-test-site/pageb.html
b/src/testresources/fetch-test-site/pageb.html
index 66e3725ef..cf77ff4f7 100644
--- a/src/testresources/fetch-test-site/pageb.html
+++ b/src/testresources/fetch-test-site/pageb.html
@@ -1,3 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<html>
<head>
<title>bage b</title>
diff --git a/src/testresources/fetch-test-site/robots.txt
b/src/testresources/fetch-test-site/robots.txt
index e69de29bb..fc590f973 100644
--- a/src/testresources/fetch-test-site/robots.txt
+++ b/src/testresources/fetch-test-site/robots.txt
@@ -0,0 +1,14 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file