Author: mikemccand
Date: Sat Dec 1 18:00:49 2012
New Revision: 1416030
URL: http://svn.apache.org/viewvc?rev=1416030&view=rev
Log:
TIKA-712: extract master text, except for title/body
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1416030&r1=1416029&r2=1416030&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Dec 1 18:00:49 2012
@@ -42,7 +42,8 @@ Release 1.3 - Current Development
* MS PowerPoint (.ppt): When a PowerPoint (.ppt) document contains
embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
XHTML so you can see where in the main text the embedded document
- occurred (TIKA-1025).
+ occurred (TIKA-1025). Text from the master slide is now extracted
+ (TIKA-712).
* MHTML: fixed Null charset name exception when a mime part has an
unrecognized charset (TIKA-1011).
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1416030&r1=1416029&r2=1416030&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Sat Dec 1 18:00:49 2012
@@ -69,20 +69,18 @@ public class HSLFExtractor extends Abstr
// Slide master, if present
// TODO: re-enable this once we fix TIKA-712
- /*
MasterSheet master = slide.getMasterSheet();
if(master != null) {
xhtml.startElement("p", "class", "slide-master-content");
- textRunsToText(xhtml, master.getTextRuns() );
+ textRunsToText(xhtml, master.getTextRuns(), true );
xhtml.endElement("p");
}
- */
// Slide text
{
xhtml.startElement("p", "class", "slide-content");
- textRunsToText(xhtml, slide.getTextRuns() );
+ textRunsToText(xhtml, slide.getTextRuns(), false );
xhtml.endElement("p");
}
@@ -150,7 +148,7 @@ public class HSLFExtractor extends Abstr
}
// Notes text
- textRunsToText(xhtml, notes.getTextRuns());
+ textRunsToText(xhtml, notes.getTextRuns(), false);
// Repeat the notes footer, if set
if (hf != null && hf.isFooterVisible() && hf.getFooterText() != null)
{
@@ -165,16 +163,20 @@ public class HSLFExtractor extends Abstr
xhtml.endElement("div");
}
- private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs) throws SAXException {
+ private void textRunsToText(XHTMLContentHandler xhtml, TextRun[] runs, boolean isMaster) throws SAXException {
if (runs==null) {
return;
}
for (TextRun run : runs) {
if (run != null) {
- xhtml.characters( run.getText() );
- xhtml.startElement("br");
- xhtml.endElement("br");
+ // Avoid boiler-plate text on the master slide (0
+ // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+ if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+ xhtml.characters(run.getText());
+ xhtml.startElement("br");
+ xhtml.endElement("br");
+ }
}
}
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1416030&r1=1416029&r2=1416030&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Sat Dec 1 18:00:49 2012
@@ -143,7 +143,6 @@ public class PowerPointParserTest extend
}
// TODO: once we fix TIKA-712, re-enable this
- /*
public void testMasterText() throws Exception {
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
@@ -162,10 +161,8 @@ public class PowerPointParserTest extend
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
}
- */
// TODO: once we fix TIKA-712, re-enable this
- /*
public void testMasterText2() throws Exception {
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
@@ -184,7 +181,6 @@ public class PowerPointParserTest extend
// Make sure boilerplate text didn't come through:
assertEquals(-1, content.indexOf("Click to edit Master"));
}
- */
/**
* Ensures that custom OLE2 (HPSF) properties are extracted