Author: tallison
Date: Thu Oct 22 14:10:11 2015
New Revision: 1710023

URL: http://svn.apache.org/viewvc?rev=1710023&view=rev
Log:
TIKA-1777 fix regression in ppt spacing, patch from Andreas Beeker

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1710023&r1=1710022&r2=1710023&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Oct 22 14:10:11 2015
@@ -1,5 +1,7 @@
 Release 1.11 - 10/18/2015
 
+  * Fix regression with spacing in PPT via Andreas Beeker (TIKA-1777).
+
   * Java7 API support for allowing java.nio.file.Path as method arguments
     was added to Tika and to ParsingReader, TikaFileTypeDetector, and to
     Tika Config (TIKA-1745, TIKA-1746, TIKA-1751).

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1710023&r1=1710022&r2=1710023&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Thu Oct 22 14:10:11 2015
@@ -238,30 +238,58 @@ public class HSLFExtractor extends Abstr
             // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
             //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
 
-               for (HSLFTextParagraph htp : run) {
-                       xhtml.startElement("p");
-
-                       for (HSLFTextRun htr : htp.getTextRuns()) {
-                               String line = htr.getRawText();
-                               if (line != null) {
-                                       boolean isfirst = true;
-                                       for (String fragment : line.split("\\u000b")){
-                                               if (!isfirst)  {
-                                   xhtml.startElement("br");
-                                   xhtml.endElement("br");
-                                               }
-                                               isfirst = false;
-                                               xhtml.characters(fragment.trim());
-                                       }
-                               }
-                       }
-                xhtml.endElement("p");
-
+            boolean isBullet = false;
+            for (HSLFTextParagraph htp : run) {
+                boolean nextBullet = htp.isBullet();
+                // TODO: identify bullet/list type
+                if (isBullet != nextBullet) {
+                    isBullet = nextBullet;
+                    if (isBullet) {
+                        xhtml.startElement("ul");
+                    } else {
+                        xhtml.endElement("ul");
+                    }
+                }
+
+                List<HSLFTextRun> textRuns = htp.getTextRuns();
+                String firstLine = removePBreak(textRuns.get(0).getRawText());
+                boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine)));
+                String paraTag = showBullet ? "li" : "p";
+
+                xhtml.startElement(paraTag);
+                for (HSLFTextRun htr : textRuns) {
+                    String line = htr.getRawText();
+                    if (line != null) {
+                        boolean isfirst = true;
+                        for (String fragment : line.split("\\u000b")) {
+                            if (!isfirst) {
+                                xhtml.startElement("br");
+                                xhtml.endElement("br");
+                            }
+                            isfirst = false;
+                            xhtml.characters(removePBreak(fragment));
+                        }
+                        if (line.endsWith("\u000b")) {
+                            xhtml.startElement("br");
+                            xhtml.endElement("br");
+                        }
+                    }
+                }
+                xhtml.endElement(paraTag);
+            }
+            if (isBullet) {
+                xhtml.endElement("ul");
             }
-               
         }
     }
 
+    // remove trailing paragraph break
+    private static String removePBreak(String fragment) {
+        // the last text run of a text paragraph contains the paragraph break (\r)
+        // line breaks (\\u000b) can happen more often
+        return fragment.replaceFirst("\\r$", "");
+    }
+
     private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml)
             throws TikaException, SAXException, IOException {
         for (HSLFPictureData pic : slideshow.getPictureData()) {
@@ -278,8 +306,8 @@ public class HSLFExtractor extends Abstr
                     mediaType = "image/bmp";
                     break;
                 default:
-                       mediaType = pic.getContentType();
-                       break;
+                    mediaType = pic.getContentType();
+                    break;
             }
 
             handleEmbeddedResource(

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1710023&r1=1710022&r2=1710023&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Thu Oct 22 14:10:11 2015
@@ -58,12 +58,12 @@ public class PowerPointParserTest extend
         Metadata metadata = new Metadata();
         String xml = getXML("testPPT_various.ppt", metadata).xml;
         assertContains("<p>Footnote appears here", xml);
-        assertContains("<p>[1]This is a footnote.", xml);
+        assertContains("<p>[1] This is a footnote.", xml);
         assertContains("<p>This is the header text.</p>", xml);
         assertContains("<p>This is the footer text.</p>", xml);
         assertContains("<p>Here is a text box</p>", xml);
-        //TODO: fix this spacing: assertContains("<p>Bold ", xml);
-        assertContains("italic", xml);
+        assertContains("<p>Bold ", xml);
+        assertContains("italic underline superscript subscript", xml);
         assertContains("underline", xml);
         assertContains("superscript", xml);
         assertContains("subscript", xml);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1710023&r1=1710022&r2=1710023&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Oct 22 14:10:11 2015
@@ -629,10 +629,7 @@ public class OOXMLParserTest extends Tik
         assertContains("<p>This is the footer text.</p>", xml);
         assertContains("<p>Here is a text box</p>", xml);
         assertContains("<p>Bold", xml);
-        assertContains("italic", xml);
-        assertContains("underline", xml);
-        assertContains("superscript", xml);
-        assertContains("subscript", xml);
+        assertContains("italic underline superscript subscript", xml);
         assertContains("<p>Here is a citation:", xml);
         assertContains("Figure 1 This is a caption for Figure 1", xml);
         assertContains("(Kramer)", xml);


Reply via email to