This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4361 in repository https://gitbox.apache.org/repos/asf/tika.git
commit bef786ed0cd777cc966384f7948a2c1d039e5dcd
Author: tallison <[email protected]>
AuthorDate: Wed Dec 4 15:35:37 2024 -0500

    TIKA-4361
---
 .../tika/parser/microsoft/rtf/TextExtractor.java | 44 ++++++++++++----------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 83abb1ae6..e5a6ae6b4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -941,6 +941,7 @@ final class TextExtractor {
             } else {
                 // In document
                 if (equals("b")) {
+                    //TIKA-4361 -- need to make sure we're not in an href?
                     // b0
                     assert param == 0;
                     if (groupState.bold) {
@@ -1085,6 +1086,9 @@ final class TextExtractor {
     }
 
     private void end(String tag) throws IOException, SAXException, TikaException {
+        if ("b".equals(tag)) {
+            System.out.println("ending b");
+        }
         out.endElement(XHTML, tag, tag);
     }
 
@@ -1479,29 +1483,31 @@ final class TextExtractor {
         if (groupStates.size() > 0) {
             // Restore group state:
             final GroupState outerGroupState = groupStates.removeLast();
-
-            // Close italic, if outer does not have italic or
-            // bold changed:
-            if (groupState.italic) {
-                if (!outerGroupState.italic || groupState.bold != outerGroupState.bold) {
-                    end("i");
-                    groupState.italic = false;
+            //only modify styles if we're not in a hyperlink
+            if (fieldState == 0) {
+                // Close italic, if outer does not have italic or
+                // bold changed:
+                if (groupState.italic) {
+                    if (!outerGroupState.italic || groupState.bold != outerGroupState.bold) {
+                        end("i");
+                        groupState.italic = false;
+                    }
                 }
-            }
 
-            // Close bold
-            if (groupState.bold && !outerGroupState.bold) {
-                end("b");
-            }
+                // Close bold
+                if (groupState.bold && !outerGroupState.bold) {
+                    end("b");
+                }
 
-            // Open bold
-            if (!groupState.bold && outerGroupState.bold) {
-                start("b");
-            }
+                // Open bold
+                if (!groupState.bold && outerGroupState.bold) {
+                    start("b");
+                }
 
-            // Open italic
-            if (!groupState.italic && outerGroupState.italic) {
-                start("i");
+                // Open italic
+                if (!groupState.italic && outerGroupState.italic) {
+                    start("i");
+                }
             }
             groupState = outerGroupState;
         }
