This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-4646-instrText-hyperlinks
in repository https://gitbox.apache.org/repos/asf/tika.git
from 5554438548 TIKA-4646 -- extract hyperlinks from instrText fields in ooxml docx
add 1926cbb449 TIKA-4646 -- improve hyperlink extraction from ooxml
No new revisions were added by this update.
Summary of changes:
.../main/java/org/apache/tika/metadata/Office.java | 14 ++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 26 ++
.../microsoft/ooxml/FieldHyperlinkTracker.java | 168 +++++++++++++
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 12 +
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 118 ++++++++-
.../ooxml/SXWPFWordExtractorDecorator.java | 2 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 266 +++++++++++++++++++++
.../ooxml/XWPFWordExtractorDecorator.java | 84 ++-----
.../tika/parser/microsoft/ExcelParserTest.java | 24 ++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 26 ++
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 47 ++++
.../test-documents/testDataConnections.xlsx | Bin 0 -> 2967 bytes
.../resources/test-documents/testExternalRefs.docx | Bin 0 -> 2125 bytes
.../resources/test-documents/testHoverAndVml.docx | Bin 0 -> 2270 bytes
14 files changed, 716 insertions(+), 71 deletions(-)
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FieldHyperlinkTracker.java
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testDataConnections.xlsx
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testExternalRefs.docx
create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testHoverAndVml.docx