This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 5638ebc TIKA-2448: Extract phonetic runs in docx with experimental
SAX parser
new 70de289 Merge remote-tracking branch 'origin/master'
5638ebc is described below
commit 5638ebc0db0b88341ba663d5ba12eebbe2925240
Author: tballison <[email protected]>
AuthorDate: Wed Aug 30 12:37:44 2017 -0400
TIKA-2448: Extract phonetic runs in docx with experimental SAX parser
---
CHANGES.txt | 2 +
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 51 +++++++++++++++++----
.../ooxml/SXWPFWordExtractorDecorator.java | 2 +-
.../tika/parser/microsoft/WordParserTest.java | 10 ++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 17 +++++++
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 16 +++++++
.../resources/test-documents/testWORD_phonetic.doc | Bin 0 -> 27136 bytes
.../test-documents/testWORD_phonetic.docx | Bin 0 -> 12523 bytes
8 files changed, 87 insertions(+), 11 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 7e19fb5..2702f68 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.17 - ???
+ * Extract phonetic runs in docx with experimental SAX parser (TIKA-2448).
+
* Extract phonetic runs from xls and allow users to turn off extraction
of phonetic runs in both xls and xlsx (TIKA-2440).
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index c4afd00..f12da58 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -21,6 +21,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.util.Date;
import java.util.Map;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.utils.DateUtils;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@@ -85,6 +86,9 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private final static String OLE_OBJECT = "OLEObject";
private final static String CR = "cr";
private final static String V = "v";
+ private final static String RUBY = "ruby"; //phonetic section
+ private final static String RT = "rt"; //phonetic run
+
public final static String W_NS =
"http://schemas.openxmlformats.org/wordprocessingml/2006/main";
private final static String MC_NS =
"http://schemas.openxmlformats.org/markup-compatibility/2006";
@@ -115,10 +119,11 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private final Map<String, String> linkedRelationships;
- private boolean inR = false;//in run or in field
+ private boolean inR = false;//in run or in field. TODO: convert this to an integer because you can have a run within a run
private boolean inT = false;
private boolean inRPr = false;
private boolean inNumPr = false;
+ private boolean inRt = false;
private boolean inPic = false;
private boolean inPict = false;
@@ -144,7 +149,9 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private final RunProperties currRunProperties = new RunProperties();
private final ParagraphProperties currPProperties = new
ParagraphProperties();
private final boolean includeTextBox;
+ private final boolean concatenatePhoneticRuns;
private final StringBuilder runBuffer = new StringBuilder();
+ private final StringBuilder rubyBuffer = new StringBuilder();//buffers rt in ruby sections (see 17.3.3.25)
private boolean inDelText = false;
@@ -158,14 +165,16 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler
bodyContentsHandler,
Map<String, String> hyperlinks) {
- this(bodyContentsHandler, hyperlinks, true);
+ this(bodyContentsHandler, hyperlinks, true, true);
}
+
public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler
bodyContentsHandler,
- Map<String, String> hyperlinks,
boolean includeTextBox) {
+ Map<String, String> hyperlinks,
boolean includeTextBox, boolean concatenatePhoneticRuns) {
this.bodyContentsHandler = bodyContentsHandler;
this.linkedRelationships = hyperlinks;
this.includeTextBox = includeTextBox;
+ this.concatenatePhoneticRuns = concatenatePhoneticRuns;
}
@@ -328,6 +337,8 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
bodyContentsHandler.endnoteReference(id);
} else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
inV = true;
+ } else if (RT.equals(localName)) {
+ inRt = true;
}
}
@@ -417,6 +428,19 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
} else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a chart
inV = false;
handleEndOfRun();
+ } else if (RT.equals(localName)) {
+ inRt = false;
+ } else if (RUBY.equals(localName)) {
+ handleEndOfRuby();
+ }
+ }
+
+ private void handleEndOfRuby() {
+ if (rubyBuffer.length() > 0) {
+ if (concatenatePhoneticRuns) {
+ bodyContentsHandler.run(currRunProperties, " (" +
rubyBuffer.toString() + ")");
+ }
+ rubyBuffer.setLength(0);
}
}
@@ -454,15 +478,15 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
if (editType.equals(EditType.MOVE_FROM) && inT) {
if (bodyContentsHandler.getIncludeMoveFromText()) {
- runBuffer.append(ch, start, length);
+ appendToBuffer(ch, start, length);
}
} else if (inT) {
- runBuffer.append(ch, start, length);
+ appendToBuffer(ch, start, length);
} else if (bodyContentsHandler.getIncludeDeletedText() &&
editType.equals(EditType.DELETE)) {
- runBuffer.append(ch, start, length);
+ appendToBuffer(ch, start, length);
} else if (inV) {
- runBuffer.append(ch, start, length);
- runBuffer.append(TAB_CHAR, 0, 1);
+ appendToBuffer(ch, start, length);
+ appendToBuffer(TAB_CHAR, 0, 1);
}
}
@@ -475,12 +499,19 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
}
if (inT) {
- runBuffer.append(ch, start, length);
+ appendToBuffer(ch, start, length);
} else if (bodyContentsHandler.getIncludeDeletedText() && inDelText) {
- runBuffer.append(ch, start, length);
+ appendToBuffer(ch, start, length);
}
}
+ private void appendToBuffer(char[] ch, int start, int length) throws
SAXException {
+ if (inRt) {
+ rubyBuffer.append(ch, start, length);
+ } else {
+ runBuffer.append(ch, start, length);
+ }
+ }
public interface XWPFBodyContentsHandler {
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 17b4e33..5c7352e 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -192,7 +192,7 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
new OfflineContentHandler(new EmbeddedContentHandler(
new OOXMLWordAndPowerPointTextHandler(
new OOXMLTikaBodyPartHandler(xhtml,
styles, listManager,
- config), linkedRelationships,
config.getIncludeShapeBasedContent()))));
+ config), linkedRelationships,
config.getIncludeShapeBasedContent(), config.getConcatenatePhoneticRuns()))));
} catch (TikaException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index fb53c95..b399d09 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -611,5 +611,15 @@ public class WordParserTest extends TikaTest {
}
}
+
+ @Test
+ public void testPhonetic() throws Exception {
+ //TODO: fix this
+ //current behavior:
+ //EQ \\* jc2 \\* "Font:MS Mincho" \\* hps11 \\o\\ad(\\s\\up 10(とうきょう),東京)
+ //We need to parse that string and separate content + phonetic
+ assertContains("\\s\\up 10(\u3068\u3046\u304D\u3087\u3046),\u6771\u4EAC",
+ getXML("testWORD_phonetic.doc").xml);
+ }
}
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index abc05b2..55946b2 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -75,6 +75,7 @@ import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import ucar.nc2.util.xml.Parse;
public class OOXMLParserTest extends TikaTest {
@@ -1736,6 +1737,22 @@ public class OOXMLParserTest extends TikaTest {
}
+ @Test
+ @Ignore("to be fixed in > POI 3.17")
+ public void testDOCXPhoneticStrings() throws Exception {
+
+ assertContains("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)",
+ getXML("testWORD_phonetic.docx").xml);
+
+ OfficeParserConfig config = new OfficeParserConfig();
+ config.setConcatenatePhoneticRuns(false);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(OfficeParserConfig.class, config);
+ String xml = getXML("testWORD_phonetic.docx", parseContext).xml;
+ assertContains("\u6771\u4EAC", xml);
+ assertNotContained("\u3068", xml);
+ }
+
}
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index bc131f0..89bd754 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -822,4 +822,20 @@ public class SXWPFExtractorTest extends TikaTest {
assertNotContained("This is the footer text.", xml);
}
+ @Test
+ public void testDOCXPhoneticStrings() throws Exception {
+ OfficeParserConfig config = new OfficeParserConfig();
+ config.setUseSAXDocxExtractor(true);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(OfficeParserConfig.class, config);
+ assertContains("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)",
+ getXML("testWORD_phonetic.docx", parseContext).xml);
+
+
+ config.setConcatenatePhoneticRuns(false);
+ String xml = getXML("testWORD_phonetic.docx", parseContext).xml;
+ assertContains("\u6771\u4EAC", xml);
+ assertNotContained("\u3068", xml);
+ }
+
}
diff --git
a/tika-parsers/src/test/resources/test-documents/testWORD_phonetic.doc
b/tika-parsers/src/test/resources/test-documents/testWORD_phonetic.doc
new file mode 100644
index 0000000..e202033
Binary files /dev/null and
b/tika-parsers/src/test/resources/test-documents/testWORD_phonetic.doc differ
diff --git
a/tika-parsers/src/test/resources/test-documents/testWORD_phonetic.docx
b/tika-parsers/src/test/resources/test-documents/testWORD_phonetic.docx
new file mode 100644
index 0000000..6fc1afe
Binary files /dev/null and
b/tika-parsers/src/test/resources/test-documents/testWORD_phonetic.docx differ
--
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.