Author: nick
Date: Tue Jul  2 15:11:35 2013
New Revision: 1498968

URL: http://svn.apache.org/r1498968
Log:
Patch from Tim Allison from TIKA-1130 - Extract from .docx SDT runs as well

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1498968&r1=1498967&r2=1498968&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 Tue Jul  2 15:11:35 2013
@@ -28,6 +28,7 @@ import org.apache.poi.xwpf.model.XWPFHea
 import org.apache.poi.xwpf.usermodel.BodyType;
 import org.apache.poi.xwpf.usermodel.IBody;
 import org.apache.poi.xwpf.usermodel.IBodyElement;
+import org.apache.poi.xwpf.usermodel.IRunElement;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
@@ -36,6 +37,8 @@ import org.apache.poi.xwpf.usermodel.XWP
 import org.apache.poi.xwpf.usermodel.XWPFPicture;
 import org.apache.poi.xwpf.usermodel.XWPFPictureData;
 import org.apache.poi.xwpf.usermodel.XWPFRun;
+import org.apache.poi.xwpf.usermodel.XWPFSDT;
+import org.apache.poi.xwpf.usermodel.XWPFSDTContent;
 import org.apache.poi.xwpf.usermodel.XWPFStyle;
 import org.apache.poi.xwpf.usermodel.XWPFStyles;
 import org.apache.poi.xwpf.usermodel.XWPFTable;
@@ -99,9 +102,22 @@ public class XWPFWordExtractorDecorator 
              XWPFTable table = (XWPFTable)element;
              extractTable(table, xhtml);
           }
+          if (element instanceof XWPFSDT){
+             extractSDT((XWPFSDT) element, xhtml);
+          }
+
       }
     }
     
+    private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws SAXException, 
+    XmlException, IOException {
+       XWPFSDTContent content = element.getContent();
+       String tag = "p";
+       xhtml.startElement(tag);
+       xhtml.characters(content.getText());
+       xhtml.endElement(tag);
+    }
+    
     private void extractParagraph(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
        // If this paragraph is actually a whole new section, then
@@ -183,79 +199,23 @@ public class XWPFWordExtractorDecorator 
           xhtml.endElement("a");
        }
        
-       // True if we are currently in the named style tag:
-       boolean curBold = false;
-       boolean curItalic = false;
-
-       // Do the text
-       for(XWPFRun run : paragraph.getRuns()) {
-          if (run.isBold() != curBold) {
-            if (curItalic) {
-              xhtml.endElement("i");
-              curItalic = false;
-            }
-            if (run.isBold()) {
-              xhtml.startElement("b");
-            } else {
-              xhtml.endElement("b");
-            }
-            curBold = run.isBold();
-          }
-
-          if (run.isItalic() != curItalic) {
-            if (run.isItalic()) {
-              xhtml.startElement("i");
-            } else {
-              xhtml.endElement("i");
-            }
-            curItalic = run.isItalic();
-          }
-
-          boolean addedHREF = false;
-          if(run instanceof XWPFHyperlinkRun) {
-             XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run;
-             XWPFHyperlink link = linkRun.getHyperlink(document);
-             if(link != null && link.getURL() != null) {
-                xhtml.startElement("a", "href", link.getURL());
-                addedHREF = true;
-             } else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
-                xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
-                addedHREF = true;
-             }
-          }
-
-          xhtml.characters(run.toString());
-          
-          // If we have any pictures, output them
-          for(XWPFPicture picture : run.getEmbeddedPictures()) {
-             if(paragraph.getDocument() != null) {
-                XWPFPictureData data = picture.getPictureData();
-                if(data != null) {
-                   AttributesImpl attr = new AttributesImpl();
-
-                   attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName());
-                   attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription());
-
-                   xhtml.startElement("img", attr);
-                   xhtml.endElement("img");
-                }
-             }
-          }
-
-          if (addedHREF) {
-            xhtml.endElement("a");
+       TmpFormatting fmtg = new TmpFormatting(false, false);
+       
+       // Do the iruns
+       for(IRunElement run : paragraph.getIRuns()) {
+          if (run instanceof XWPFSDT){
+             fmtg = closeStyleTags(xhtml, fmtg);
+             processSDTRun((XWPFSDT)run, xhtml);
+             //for now, we're ignoring formatting in sdt
+             //if you hit an sdt reset to false
+             fmtg.setBold(false);
+             fmtg.setItalic(false);
+          } else {
+             fmtg = processRun((XWPFRun)run, paragraph, xhtml, fmtg);
           }
        }
+       closeStyleTags(xhtml, fmtg);
        
-       // Close any still open style tags
-       if (curItalic) {
-         xhtml.endElement("i");
-         curItalic = false;
-       }
-       if (curBold) {
-         xhtml.endElement("b");
-         curBold = false;
-       }
        
        // Now do any comments for the paragraph
        XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
@@ -282,6 +242,89 @@ public class XWPFWordExtractorDecorator 
        }
     }
 
+    private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
+          TmpFormatting fmtg) throws SAXException {
+       // Close any still open style tags
+       if (fmtg.isItalic()) {
+          xhtml.endElement("i");
+          fmtg.setItalic(false);
+       }
+       if (fmtg.isBold()) {
+          xhtml.endElement("b");
+          fmtg.setBold(false);
+       }
+       return fmtg;
+    }
+
+    private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph, 
+          XHTMLContentHandler xhtml, TmpFormatting tfmtg) 
+          throws SAXException, XmlException, IOException{
+       // True if we are currently in the named style tag:
+       if (run.isBold() != tfmtg.isBold()) {
+          if (tfmtg.isItalic()) {
+             xhtml.endElement("i");
+             tfmtg.setItalic(false);
+          }
+          if (run.isBold()) {
+             xhtml.startElement("b");
+          } else {
+             xhtml.endElement("b");
+          }
+          tfmtg.setBold(run.isBold());
+       }
+
+       if (run.isItalic() != tfmtg.isItalic()) {
+          if (run.isItalic()) {
+             xhtml.startElement("i");
+          } else {
+             xhtml.endElement("i");
+          }
+          tfmtg.setItalic(run.isItalic());
+       }
+
+       boolean addedHREF = false;
+       if(run instanceof XWPFHyperlinkRun) {
+          XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun)run;
+          XWPFHyperlink link = linkRun.getHyperlink(document);
+          if(link != null && link.getURL() != null) {
+             xhtml.startElement("a", "href", link.getURL());
+             addedHREF = true;
+          } else if(linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
+             xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
+             addedHREF = true;
+          }
+       }
+
+       xhtml.characters(run.toString());
+
+       // If we have any pictures, output them
+       for(XWPFPicture picture : run.getEmbeddedPictures()) {
+          if(paragraph.getDocument() != null) {
+             XWPFPictureData data = picture.getPictureData();
+             if(data != null) {
+                AttributesImpl attr = new AttributesImpl();
+
+                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName());
+                attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription());
+
+                xhtml.startElement("img", attr);
+                xhtml.endElement("img");
+             }
+          }
+       }
+
+       if (addedHREF) {
+          xhtml.endElement("a");
+       }
+
+       return tfmtg;
+    }
+
+    private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
+          throws SAXException, XmlException, IOException{
+       xhtml.characters(run.getContent().getText());
+    }
+
     private void extractTable(XWPFTable table, XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
        xhtml.startElement("table");
@@ -333,12 +376,15 @@ public class XWPFWordExtractorDecorator 
     }
 
     private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header) throws SAXException, XmlException, IOException {
-        for(XWPFParagraph p : header.getParagraphs()) {
-            extractParagraph(p, xhtml);
-        }
 
-        for(XWPFTable table : header.getTables()) {
-            extractTable(table, xhtml);
+        for (IBodyElement e : header.getBodyElements()){
+           if (e instanceof XWPFParagraph){
+              extractParagraph((XWPFParagraph)e, xhtml);
+           } else if (e instanceof XWPFTable){
+              extractTable((XWPFTable)e, xhtml);
+           } else if (e instanceof XWPFSDT){
+              extractSDT((XWPFSDT)e, xhtml);
+           }
         }
     }
 
@@ -352,4 +398,27 @@ public class XWPFWordExtractorDecorator 
        parts.add( document.getPackagePart() );
        return parts;
     }
+    
+    private class TmpFormatting{
+       private boolean bold = false;
+       private boolean italic = false;
+       private TmpFormatting(boolean bold, boolean italic){
+          this.bold = bold;
+          this.italic = italic;
+       }
+       public boolean isBold() {
+          return bold;
+       }
+       public void setBold(boolean bold) {
+          this.bold = bold;
+       }
+       public boolean isItalic() {
+          return italic;
+       }
+       public void setItalic(boolean italic) {
+          this.italic = italic;
+       }
+       
+    }
+
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1498968&r1=1498967&r2=1498968&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Tue Jul  2 15:11:35 2013
@@ -970,10 +970,8 @@ public class OOXMLParserTest extends Tik
     /**
      * Test for missing text described in 
      * <a href="https://issues.apache.org/jira/browse/TIKA-1130">TIKA-1130</a>.
-     * 
-     * @throws Exception
      */
-    public void disabledTestMissingText() throws Exception { // TODO: Enable test once POI has been updated. 
+    public void testMissingText() throws Exception {
         Metadata metadata = new Metadata();
         ContentHandler handler = new BodyContentHandler();
         ParseContext context = new ParseContext();
@@ -990,4 +988,4 @@ public class OOXMLParserTest extends Tik
             input.close();
         }
     }
-  }
+}


Reply via email to