Author: tallison
Date: Thu Oct 22 14:10:11 2015
New Revision: 1710023
URL: http://svn.apache.org/viewvc?rev=1710023&view=rev
Log:
TIKA-1777 fix regression in ppt spacing, patch from Andreas Beeker
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1710023&r1=1710022&r2=1710023&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Oct 22 14:10:11 2015
@@ -1,5 +1,7 @@
Release 1.11 - 10/18/2015
+ * Fix regression with spacing in PPT via Andreas Beeker (TIKA-1777).
+
* Java7 API support for allowing java.nio.file.Path as method arguments
was added to Tika and to ParsingReader, TikaFileTypeDetector, and to
Tika Config (TIKA-1745, TIKA-1746, TIKA-1751).
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1710023&r1=1710022&r2=1710023&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
Thu Oct 22 14:10:11 2015
@@ -238,30 +238,58 @@ public class HSLFExtractor extends Abstr
// = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
//if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
- for (HSLFTextParagraph htp : run) {
- xhtml.startElement("p");
-
- for (HSLFTextRun htr : htp.getTextRuns()) {
- String line = htr.getRawText();
- if (line != null) {
- boolean isfirst = true;
- for (String fragment : line.split("\\u000b")){
- if (!isfirst) {
- xhtml.startElement("br");
- xhtml.endElement("br");
- }
- isfirst = false;
- xhtml.characters(fragment.trim());
- }
- }
- }
- xhtml.endElement("p");
-
+ boolean isBullet = false;
+ for (HSLFTextParagraph htp : run) {
+ boolean nextBullet = htp.isBullet();
+ // TODO: identify bullet/list type
+ if (isBullet != nextBullet) {
+ isBullet = nextBullet;
+ if (isBullet) {
+ xhtml.startElement("ul");
+ } else {
+ xhtml.endElement("ul");
+ }
+ }
+
+ List<HSLFTextRun> textRuns = htp.getTextRuns();
+ String firstLine = removePBreak(textRuns.get(0).getRawText());
+ boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine)));
+ String paraTag = showBullet ? "li" : "p";
+
+ xhtml.startElement(paraTag);
+ for (HSLFTextRun htr : textRuns) {
+ String line = htr.getRawText();
+ if (line != null) {
+ boolean isfirst = true;
+ for (String fragment : line.split("\\u000b")) {
+ if (!isfirst) {
+ xhtml.startElement("br");
+ xhtml.endElement("br");
+ }
+ isfirst = false;
+ xhtml.characters(removePBreak(fragment));
+ }
+ if (line.endsWith("\u000b")) {
+ xhtml.startElement("br");
+ xhtml.endElement("br");
+ }
+ }
+ }
+ xhtml.endElement(paraTag);
+ }
+ if (isBullet) {
+ xhtml.endElement("ul");
}
-
}
}
+ // remove trailing paragraph break
+ private static String removePBreak(String fragment) {
+ // the last text run of a text paragraph contains the paragraph break (\r)
+ // line breaks (\\u000b) can happen more often
+ return fragment.replaceFirst("\\r$", "");
+ }
+
private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml)
throws TikaException, SAXException, IOException {
for (HSLFPictureData pic : slideshow.getPictureData()) {
@@ -278,8 +306,8 @@ public class HSLFExtractor extends Abstr
mediaType = "image/bmp";
break;
default:
- mediaType = pic.getContentType();
- break;
+ mediaType = pic.getContentType();
+ break;
}
handleEmbeddedResource(
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1710023&r1=1710022&r2=1710023&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
Thu Oct 22 14:10:11 2015
@@ -58,12 +58,12 @@ public class PowerPointParserTest extend
Metadata metadata = new Metadata();
String xml = getXML("testPPT_various.ppt", metadata).xml;
assertContains("<p>Footnote appears here", xml);
- assertContains("<p>[1]This is a footnote.", xml);
+ assertContains("<p>[1] This is a footnote.", xml);
assertContains("<p>This is the header text.</p>", xml);
assertContains("<p>This is the footer text.</p>", xml);
assertContains("<p>Here is a text box</p>", xml);
- //TODO: fix this spacing: assertContains("<p>Bold ", xml);
- assertContains("italic", xml);
+ assertContains("<p>Bold ", xml);
+ assertContains("italic underline superscript subscript", xml);
assertContains("underline", xml);
assertContains("superscript", xml);
assertContains("subscript", xml);
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1710023&r1=1710022&r2=1710023&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Thu Oct 22 14:10:11 2015
@@ -629,10 +629,7 @@ public class OOXMLParserTest extends Tik
assertContains("<p>This is the footer text.</p>", xml);
assertContains("<p>Here is a text box</p>", xml);
assertContains("<p>Bold", xml);
- assertContains("italic", xml);
- assertContains("underline", xml);
- assertContains("superscript", xml);
- assertContains("subscript", xml);
+ assertContains("italic underline superscript subscript", xml);
assertContains("<p>Here is a citation:", xml);
assertContains("Figure 1 This is a caption for Figure 1", xml);
assertContains("(Kramer)", xml);