Author: tallison
Date: Fri Sep 27 18:55:31 2013
New Revision: 1527030
URL: http://svn.apache.org/r1527030
Log:
TIKA-1171 -- extra asterisks from master slide in PPT; added tests to TIKA-712
test files to show TIKA-1171 was fixed. Borrowed extraction code from POI's
PowerPointExtractor.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1527030&r1=1527029&r2=1527030&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
Fri Sep 27 18:55:31 2013
@@ -68,19 +68,13 @@ public class HSLFExtractor extends Abstr
}
// Slide master, if present
- // TODO: re-enable this once we fix TIKA-712
- MasterSheet master = slide.getMasterSheet();
- if(master != null) {
- xhtml.startElement("p", "class", "slide-master-content");
- textRunsToText(xhtml, master.getTextRuns(), true );
- xhtml.endElement("p");
- }
+ extractMaster(xhtml, slide.getMasterSheet());
// Slide text
{
xhtml.startElement("p", "class", "slide-content");
- textRunsToText(xhtml, slide.getTextRuns(), false );
+ textRunsToText(xhtml, slide.getTextRuns());
xhtml.endElement("p");
}
@@ -155,7 +149,7 @@ public class HSLFExtractor extends Abstr
}
// Notes text
- textRunsToText(xhtml, notes.getTextRuns(), false);
+ textRunsToText(xhtml, notes.getTextRuns());
// Repeat the notes footer, if set
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null)
{
@@ -170,6 +164,31 @@ public class HSLFExtractor extends Abstr
xhtml.endElement("div");
}
+ private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master)
throws SAXException {
+ if (master == null){
+ return;
+ }
+ Shape[] shapes = master.getShapes();
+ if (shapes == null || shapes.length == 0){
+ return;
+ }
+
+ xhtml.startElement("div", "class", "slide-master-content");
+ for (int i = 0; i < shapes.length; i++){
+ Shape sh = shapes[i];
+ if (sh != null && ! MasterSheet.isPlaceholder(sh)){
+ if (sh instanceof TextShape){
+ TextShape tsh = (TextShape)sh;
+ String text = tsh.getText();
+ if (text != null){
+ xhtml.element("p", text);
+ }
+ }
+ }
+ }
+ xhtml.endElement("div");
+ }
+
private void extractTableText(XHTMLContentHandler xhtml, Table shape)
throws SAXException {
xhtml.startElement("table");
for (int row = 0; row < shape.getNumberOfRows(); row++){
@@ -188,17 +207,20 @@ public class HSLFExtractor extends Abstr
xhtml.endElement("table");
}
- private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs,
boolean isMaster) throws SAXException {
+ private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs)
throws SAXException {
if (runs==null) {
return;
}
for (TextRun run : runs) {
if (run != null) {
+ // Leaving in wisdom from TIKA-712 for easy revert.
// Avoid boiler-plate text on the master slide (0
// = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
- if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
- xhtml.characters(run.getText());
+ //if (!isMaster || (run.getRunType() != 0 && run.getRunType() !=
1)) {
+ String txt = run.getText();
+ if (txt != null){
+ xhtml.characters(txt);
xhtml.startElement("br");
xhtml.endElement("br");
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1527030&r1=1527029&r2=1527030&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
Fri Sep 27 18:55:31 2013
@@ -141,6 +141,9 @@ public class PowerPointParserTest extend
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
+
+ //TIKA-1171
+ assertEquals(-1, content.indexOf("*"));
}
// TODO: once we fix TIKA-712, re-enable this
@@ -161,6 +164,9 @@ public class PowerPointParserTest extend
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
+
+ //TIKA-1171
+ assertEquals(-1, content.indexOf("*"));
}
// TODO: once we fix TIKA-712, re-enable this
@@ -181,6 +187,8 @@ public class PowerPointParserTest extend
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
+ //TIKA-1171
+ assertEquals(-1, content.indexOf("*"));
}
/**