This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
from caf312593 TIKA-4135 -- remove xerces2 as a dependency (#1360)
add 5361b6d12 TIKA-1599 (#1356)
No new revisions were added by this update.
Summary of changes:
CHANGES.txt | 9 +-
pom.xml | 2 +
.../test/java/org/apache/tika/cli/TikaCLITest.java | 4 +-
.../src/test/resources/test-data/tika-config1.xml | 2 +-
tika-bom/pom.xml | 11 +-
.../org/apache/tika/example/TIAParsingExample.java | 6 +-
tika-parent/pom.xml | 5 +
.../src/test/resources/2.4.0-no-tesseract.txt | 8 +-
.../src/test/resources/2.4.0-tesseract.txt | 8 +-
.../src/test/resources/2.4.1-no-tesseract.txt | 8 +-
.../src/test/resources/2.4.1-tesseract.txt | 8 +-
.../tika-parser-tagsoup-module/pom.xml | 34 +++
.../tika/parser/html/tagsoup}/DataURIScheme.java | 2 +-
.../html/tagsoup}/DataURISchemeParseException.java | 2 +-
.../parser/html/tagsoup}/DataURISchemeUtil.java | 2 +-
.../parser/html/tagsoup}/DefaultHtmlMapper.java | 2 +-
.../parser/html/tagsoup}/HtmlEncodingDetector.java | 2 +-
.../tika/parser/html/tagsoup}/HtmlHandler.java | 2 +-
.../tika/parser/html/tagsoup}/HtmlMapper.java | 2 +-
.../tika/parser/html/tagsoup}/HtmlParser.java | 2 +-
.../parser/html/tagsoup}/IdentityHtmlMapper.java | 2 +-
.../html/tagsoup}/XHTMLDowngradeHandler.java | 2 +-
.../tagsoup}/charsetdetector/CharsetAliases.java | 6 +-
.../charsetdetector/CharsetDetectionResult.java | 2 +-
.../tagsoup}/charsetdetector/MetaProcessor.java | 6 +-
.../html/tagsoup}/charsetdetector/PreScanner.java | 2 +-
.../StandardHtmlEncodingDetector.java | 6 +-
.../charsets/ReplacementCharset.java | 2 +-
.../charsets/XUserDefinedCharset.java | 2 +-
.../org.apache.tika.detect.EncodingDetector | 2 +-
.../services/org.apache.tika.parser.Parser | 2 +-
.../StandardCharsets_unsupported_by_IANA.txt | 0
.../html/tagsoup}/DataURISchemeParserTest.java | 3 +-
.../html/tagsoup}/HtmlEncodingDetectorTest.java | 3 +-
.../tika/parser/html/tagsoup}/HtmlParserTest.java | 5 +-
.../tika/parser/html/tagsoup}/SrcDocTest.java | 2 +-
.../tagsoup}/StandardHtmlEncodingDetectorTest.java | 6 +-
.../org/apache/tika/parser/html/tika-config.xml | 4 +-
.../resources/test-documents/big-preamble.html | 0
.../test-documents/boilerplate-whitespace.html | 0
.../test/resources/test-documents/boilerplate.html | 0
.../testBoilerplateMissingSpace.html | 0
.../test/resources/test-documents/testHTML.html | 0
.../test-documents/testHTMLBadScript.html | 0
.../test-documents/testHTMLGoodScript.html | 0
.../testHTMLNoisyMetaEncoding_1.html | 0
.../testHTMLNoisyMetaEncoding_2.html | 0
.../testHTMLNoisyMetaEncoding_3.html | 0
.../testHTMLNoisyMetaEncoding_4.html | 0
.../test-documents/testHTML_charset_utf16le.html | Bin
.../test-documents/testHTML_charset_utf8.html | 0
.../testHTML_embedded_data_uri_js.html | 0
.../test-documents/testHTML_embedded_img.html | 0
.../testHTML_embedded_img_in_js.html | 0
.../resources/test-documents/testHTML_head.html | 0
.../test-documents/testHTML_metadata.html | 0
.../testHTML_metadata_two_titles.html | 0
.../resources/test-documents/testHTML_utf8.html | 0
.../test/resources/test-documents/testSrcDoc.html | 0
.../test-documents/testUserDefinedCharset.mhtml | 0
.../test/resources/test-documents/testXHTML.html | 0
.../src/test/resources/test-documents/tika434.html | 0
.../pom.xml | 46 +---
.../tika-parser-html-module/pom.xml | 5 +-
.../org/apache/tika/parser/html/JSoupParser.java | 243 +++++++++++++++++++++
.../services/org.apache.tika.parser.Parser | 2 +-
.../apache/tika/parser/html/HtmlParserTest.java | 121 ++++------
.../org/apache/tika/parser/html/tika-config.xml | 4 +-
.../tika/parser/mail/MailContentHandler.java | 4 +-
.../tika/parser/microsoft/JackcessExtractor.java | 6 +-
.../tika/parser/microsoft/OutlookExtractor.java | 6 +-
.../tika/parser/microsoft/chm/ChmParser.java | 6 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 2 +-
.../org/apache/tika/sax/BoilerpipeHandlerTest.java | 21 +-
74 files changed, 447 insertions(+), 197 deletions(-)
create mode 100644 tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/pom.xml
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/DataURIScheme.java
(98%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/DataURISchemeParseException.java
(95%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/DataURISchemeUtil.java
(98%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/DefaultHtmlMapper.java
(99%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/HtmlEncodingDetector.java
(99%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/HtmlHandler.java
(99%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/HtmlMapper.java
(98%)
rename
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/HtmlParser.java
(99%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/IdentityHtmlMapper.java
(96%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/XHTMLDowngradeHandler.java
(98%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/charsetdetector/CharsetAliases.java
(97%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/charsetdetector/CharsetDetectionResult.java
(97%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/charsetdetector/MetaProcessor.java
(92%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/charsetdetector/PreScanner.java
(99%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/charsetdetector/StandardHtmlEncodingDetector.java
(95%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/charsetdetector/charsets/ReplacementCharset.java
(96%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/java/org/apache/tika/parser/html/tagsoup}/charsetdetector/charsets/XUserDefinedCharset.java
(96%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
(93%)
copy {tika-core/src/test =>
tika-parsers/tika-parsers-extended/tika-parser-tagsoup-module/src/main}/resources/META-INF/services/org.apache.tika.parser.Parser
(94%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/resources/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/main/resources/org/apache/tika/parser/html/tagsoup}/StandardCharsets_unsupported_by_IANA.txt
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup}/DataURISchemeParserTest.java
(96%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup}/HtmlEncodingDetectorTest.java
(97%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup}/HtmlParserTest.java
(99%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup}/SrcDocTest.java
(97%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html
=>
tika-parsers-extended/tika-parser-tagsoup-module/src/test/java/org/apache/tika/parser/html/tagsoup}/StandardHtmlEncodingDetectorTest.java
(98%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/org/apache/tika/parser/html/tika-config.xml
(87%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/big-preamble.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/boilerplate-whitespace.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/boilerplate.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testBoilerplateMissingSpace.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTMLBadScript.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTMLGoodScript.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_1.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_2.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_3.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTMLNoisyMetaEncoding_4.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_charset_utf16le.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_charset_utf8.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_embedded_data_uri_js.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_embedded_img.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_embedded_img_in_js.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_head.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_metadata.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_metadata_two_titles.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testHTML_utf8.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testSrcDoc.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testUserDefinedCharset.mhtml
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/testXHTML.html
(100%)
copy
tika-parsers/{tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module
=>
tika-parsers-extended/tika-parser-tagsoup-module}/src/test/resources/test-documents/tika434.html
(100%)
copy tika-parsers/tika-parsers-extended/{tika-parser-sqlite3-package => tika-parser-tagsoup-package}/pom.xml (62%)
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/JSoupParser.java