Repository: tika
Updated Branches:
  refs/heads/2.x 573527bbc -> cdfacdb41


TIKA-2030 - add handling for <text:s/> element to ODT parser. Thanks to David 
Pilato for opening this issue.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/87e1e23b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/87e1e23b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/87e1e23b

Branch: refs/heads/2.x
Commit: 87e1e23b46dc68b24288b778e040a3aa55e05628
Parents: 2a7e52e
Author: tballison <[email protected]>
Authored: Fri Jul 8 14:21:16 2016 -0400
Committer: tballison <[email protected]>
Committed: Fri Jul 8 14:21:16 2016 -0400

----------------------------------------------------------------------
 .../parser/odf/OpenDocumentContentParser.java   |   5 +++++
 .../test-documents/testOpenOffice2.odt          | Bin 26448 -> 27554 bytes
 2 files changed, 5 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/87e1e23b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
index a32d406..a149dd8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -68,6 +68,9 @@ public class OpenDocumentContentParser extends AbstractParser {
 
     private static final class OpenDocumentElementMappingContentHandler extends
             ElementMappingContentHandler {
+
+        private static final char[] SPACE = new char[]{ ' '};
+
         private final ContentHandler handler;
         private final BitSet textNodeStack = new BitSet();
         private int nodeDepth = 0;
@@ -283,6 +286,8 @@ public class OpenDocumentContentParser extends AbstractParser {
                     startList(attrs.getValue(TEXT_NS, "style-name"));
                 } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
                     startSpan(attrs.getValue(TEXT_NS, "style-name"));
+                } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
+                    handler.characters(SPACE, 0, 1);
                 } else {
                     super.startElement(namespaceURI, localName, qName, attrs);
                 }

http://git-wip-us.apache.org/repos/asf/tika/blob/87e1e23b/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt b/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt
index bc31925..f6c72b6 100644
Binary files a/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt and b/tika-test-resources/src/test/resources/test-documents/testOpenOffice2.odt differ

Reply via email to