Repository: tika
Updated Branches:
  refs/heads/2.x 573527bbc -> cdfacdb41
TIKA-2030 - add handling for <text:s/> element to ODT parser. Thanks to David Pilato for opening this issue.

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/87e1e23b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/87e1e23b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/87e1e23b

Branch: refs/heads/2.x
Commit: 87e1e23b46dc68b24288b778e040a3aa55e05628
Parents: 2a7e52e
Author: tballison <[email protected]>
Authored: Fri Jul 8 14:21:16 2016 -0400
Committer: tballison <[email protected]>
Committed: Fri Jul 8 14:21:16 2016 -0400

----------------------------------------------------------------------
 .../parser/odf/OpenDocumentContentParser.java | 5 +++++
 .../test-documents/testOpenOffice2.odt | Bin 26448 -> 27554 bytes
 2 files changed, 5 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/87e1e23b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
index a32d406..a149dd8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -68,6 +68,9 @@ public class OpenDocumentContentParser extends AbstractParser {
 
     private static final class OpenDocumentElementMappingContentHandler extends
             ElementMappingContentHandler {
+
+        private static final char[] SPACE = new char[]{ ' '};
+
         private final ContentHandler handler;
         private final BitSet textNodeStack = new BitSet();
         private int nodeDepth = 0;
@@ -283,6 +286,8 @@ public class OpenDocumentContentParser extends AbstractParser {
                 startList(attrs.getValue(TEXT_NS, "style-name"));
             } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
                 startSpan(attrs.getValue(TEXT_NS, "style-name"));
+            } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
+                handler.characters(SPACE, 0, 1);
             } else {
                 super.startElement(namespaceURI, localName, qName, attrs);
             }

http://git-wip-us.apache.org/repos/asf/tika/blob/87e1e23b/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt b/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt
index bc31925..f6c72b6 100644
Binary files a/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt and b/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt differ
