Author: tallison Date: Fri Sep 27 18:55:31 2013 New Revision: 1527030 URL: http://svn.apache.org/r1527030 Log: TIKA-1171 -- fix extra asterisks from master slide in PPT; added tests to the TIKA-712 test files to show that TIKA-1171 was fixed. Borrowed extraction code from POI's PowerPointExtractor.
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1527030&r1=1527029&r2=1527030&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Fri Sep 27 18:55:31 2013 @@ -68,19 +68,13 @@ public class HSLFExtractor extends Abstr } // Slide master, if present - // TODO: re-enable this once we fix TIKA-712 - MasterSheet master = slide.getMasterSheet(); - if(master != null) { - xhtml.startElement("p", "class", "slide-master-content"); - textRunsToText(xhtml, master.getTextRuns(), true ); - xhtml.endElement("p"); - } + extractMaster(xhtml, slide.getMasterSheet()); // Slide text { xhtml.startElement("p", "class", "slide-content"); - textRunsToText(xhtml, slide.getTextRuns(), false ); + textRunsToText(xhtml, slide.getTextRuns()); xhtml.endElement("p"); } @@ -155,7 +149,7 @@ public class HSLFExtractor extends Abstr } // Notes text - textRunsToText(xhtml, notes.getTextRuns(), false); + textRunsToText(xhtml, notes.getTextRuns()); // Repeat the notes footer, if set if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null) { @@ -170,6 +164,31 @@ public class HSLFExtractor extends Abstr xhtml.endElement("div"); } + private void extractMaster(XHTMLContentHandler xhtml, MasterSheet master) throws SAXException { + if (master == null){ + return; + } + Shape[] shapes = master.getShapes(); + if (shapes == null || shapes.length == 0){ + return; + } + + xhtml.startElement("div", 
"class", "slide-master-content"); + for (int i = 0; i < shapes.length; i++){ + Shape sh = shapes[i]; + if (sh != null && ! MasterSheet.isPlaceholder(sh)){ + if (sh instanceof TextShape){ + TextShape tsh = (TextShape)sh; + String text = tsh.getText(); + if (text != null){ + xhtml.element("p", text); + } + } + } + } + xhtml.endElement("div"); + } + private void extractTableText(XHTMLContentHandler xhtml, Table shape) throws SAXException { xhtml.startElement("table"); for (int row = 0; row < shape.getNumberOfRows(); row++){ @@ -188,17 +207,20 @@ public class HSLFExtractor extends Abstr xhtml.endElement("table"); } - private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs, boolean isMaster) throws SAXException { + private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException { if (runs==null) { return; } for (TextRun run : runs) { if (run != null) { + // Leaving in wisdom from TIKA-712 for easy revert. // Avoid boiler-plate text on the master slide (0 // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE): - if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { - xhtml.characters(run.getText()); + //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) { + String txt = run.getText(); + if (txt != null){ + xhtml.characters(txt); xhtml.startElement("br"); xhtml.endElement("br"); } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1527030&r1=1527029&r2=1527030&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Fri Sep 27 18:55:31 2013 @@ -141,6 +141,9 @@ public 
class PowerPointParserTest extend // Make sure boilerplate text didn't come through: assertEquals(-1, content.indexOf("Click to edit Master")); + + //TIKA-1171 + assertEquals(-1, content.indexOf("*")); } // TODO: once we fix TIKA-712, re-enable this @@ -161,6 +164,9 @@ public class PowerPointParserTest extend // Make sure boilerplate text didn't come through: assertEquals(-1, content.indexOf("Click to edit Master")); + + //TIKA-1171 + assertEquals(-1, content.indexOf("*")); } // TODO: once we fix TIKA-712, re-enable this @@ -181,6 +187,8 @@ public class PowerPointParserTest extend // Make sure boilerplate text didn't come through: assertEquals(-1, content.indexOf("Click to edit Master")); + //TIKA-1171 + assertEquals(-1, content.indexOf("*")); } /**