This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8adac3eac TIKA-4361 (#2075)
8adac3eac is described below
commit 8adac3eac3c5e3e4d4de96ef8092c487a1281361
Author: Tim Allison <[email protected]>
AuthorDate: Wed Dec 4 16:12:44 2024 -0500
TIKA-4361 (#2075)
---
.../tika/parser/microsoft/rtf/TextExtractor.java | 44 ++++++++++++----------
1 file changed, 25 insertions(+), 19 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 83abb1ae6..e5a6ae6b4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -941,6 +941,7 @@ final class TextExtractor {
} else {
// In document
if (equals("b")) {
+ //TIKA-4361 -- need to make sure we're not in an href?
// b0
assert param == 0;
if (groupState.bold) {
@@ -1085,6 +1086,9 @@ final class TextExtractor {
}
private void end(String tag) throws IOException, SAXException,
TikaException {
+ if ("b".equals(tag)) {
+ System.out.println("ending b");
+ }
out.endElement(XHTML, tag, tag);
}
@@ -1479,29 +1483,31 @@ final class TextExtractor {
if (groupStates.size() > 0) {
// Restore group state:
final GroupState outerGroupState = groupStates.removeLast();
-
- // Close italic, if outer does not have italic or
- // bold changed:
- if (groupState.italic) {
- if (!outerGroupState.italic || groupState.bold != outerGroupState.bold) {
- end("i");
- groupState.italic = false;
+ //only modify styles if we're not in a hyperlink
+ if (fieldState == 0) {
+ // Close italic, if outer does not have italic or
+ // bold changed:
+ if (groupState.italic) {
+ if (!outerGroupState.italic || groupState.bold != outerGroupState.bold) {
+ end("i");
+ groupState.italic = false;
+ }
}
- }
- // Close bold
- if (groupState.bold && !outerGroupState.bold) {
- end("b");
- }
+ // Close bold
+ if (groupState.bold && !outerGroupState.bold) {
+ end("b");
+ }
- // Open bold
- if (!groupState.bold && outerGroupState.bold) {
- start("b");
- }
+ // Open bold
+ if (!groupState.bold && outerGroupState.bold) {
+ start("b");
+ }
- // Open italic
- if (!groupState.italic && outerGroupState.italic) {
- start("i");
+ // Open italic
+ if (!groupState.italic && outerGroupState.italic) {
+ start("i");
+ }
}
groupState = outerGroupState;
}