Repository: tika Updated Branches: refs/heads/master 636060eb6 -> 8d29f7a62
TIKA-2030 - add processing for <text:s/> element in odt, thanks to David Pilato for identifying this. Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c0320f14 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c0320f14 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c0320f14 Branch: refs/heads/master Commit: c0320f14194608d31b9ffaae9250f28c46017b75 Parents: 95b2cd1 Author: tballison <talli...@mitre.org> Authored: Fri Jul 8 14:15:50 2016 -0400 Committer: tballison <talli...@mitre.org> Committed: Fri Jul 8 14:15:50 2016 -0400 ---------------------------------------------------------------------- .../parser/odf/OpenDocumentContentParser.java | 3 +++ .../test-documents/testOpenOffice2.odt | Bin 26448 -> 27554 bytes 2 files changed, 3 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/c0320f14/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java index a32d406..b40ed27 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java @@ -68,6 +68,7 @@ public class OpenDocumentContentParser extends AbstractParser { private static final class OpenDocumentElementMappingContentHandler extends ElementMappingContentHandler { + private static final char[] SPACE = new char[]{ ' '}; private final ContentHandler handler; private final BitSet textNodeStack = new BitSet(); private int nodeDepth = 0; @@ -283,6 +284,8 @@ public class OpenDocumentContentParser extends AbstractParser { startList(attrs.getValue(TEXT_NS, "style-name")); } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { startSpan(attrs.getValue(TEXT_NS, "style-name")); + } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) { + handler.characters(SPACE, 0, 1); } else { super.startElement(namespaceURI, localName, qName, attrs); } http://git-wip-us.apache.org/repos/asf/tika/blob/c0320f14/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt b/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt index bc31925..f6c72b6 100644 Binary files a/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt and b/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt differ