a...

tallison Fri, 29 May 2015 07:37:37 -0700

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri May 29 14:36:21 2015
@@ -59,10 +59,18 @@ public class WordExtractor extends Abstr
     private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
     private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
     // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
-    private static final String LIST_DELIMITER = " "; 
+    private static final String LIST_DELIMITER = " ";
+    private static final Map<String, TagAndStyle> fixedParagraphStyles = new 
HashMap<String, TagAndStyle>();
+    private static final TagAndStyle defaultParagraphStyle = new 
TagAndStyle("p", null);
 
-    public WordExtractor(ParseContext context) {
-        super(context);
+    static {
+        fixedParagraphStyles.put("Default", defaultParagraphStyle);
+        fixedParagraphStyles.put("Normal", defaultParagraphStyle);
+        fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
+        fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
+        fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
+        fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", 
"subtitle"));
+        fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", 
null));
     }
 
     // True if we are currently in the named style tag:
@@ -70,6 +78,57 @@ public class WordExtractor extends Abstr
     private boolean curBold;
     private boolean curItalic;
 
+    public WordExtractor(ParseContext context) {
+        super(context);
+    }
+
+    private static int countParagraphs(Range... ranges) {
+        int count = 0;
+        for (Range r : ranges) {
+            if (r != null) {
+                count += r.numParagraphs();
+            }
+        }
+        return count;
+    }
+
+    /**
+     * Given a style name, return what tag should be used, and
+     * what style should be applied to it.
+     */
+    public static TagAndStyle buildParagraphTagAndStyle(String styleName, 
boolean isTable) {
+        TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
+        if (tagAndStyle != null) {
+            return tagAndStyle;
+        }
+
+        if (styleName.equals("Table Contents") && isTable) {
+            return defaultParagraphStyle;
+        }
+
+        String tag = "p";
+        String styleClass = null;
+
+        if (styleName.startsWith("heading") || 
styleName.startsWith("Heading")) {
+            // "Heading 3" or "Heading2" or "heading 4"
+            int num = 1;
+            try {
+                num = Integer.parseInt(
+                        styleName.substring(styleName.length() - 1)
+                );
+            } catch (NumberFormatException e) {
+            }
+            // Turn it into a H1 - H6 (H7+ isn't valid!)
+            tag = "h" + Math.min(num, 6);
+        } else {
+            styleClass = styleName.replace(' ', '_');
+            styleClass = styleClass.substring(0, 1).toLowerCase(Locale.ROOT) +
+                    styleClass.substring(1);
+        }
+
+        return new TagAndStyle(tag, styleClass);
+    }
+
     protected void parse(
             NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
@@ -82,12 +141,12 @@ public class WordExtractor extends Abstr
         HWPFDocument document;
         try {
             document = new HWPFDocument(root);
-        } catch(OldWordFileFormatException e) {
+        } catch (OldWordFileFormatException e) {
             parseWord6(root, xhtml);
             return;
         }
         org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
-            new org.apache.poi.hwpf.extractor.WordExtractor(document);
+                new org.apache.poi.hwpf.extractor.WordExtractor(document);
         HeaderStories headerFooter = new HeaderStories(document);
 
         // Grab the list of pictures. As far as we can tell,
@@ -97,24 +156,24 @@ public class WordExtractor extends Abstr
         PicturesSource pictures = new PicturesSource(document);
 
         // Do any headers, if present
-        Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(),
-                headerFooter.getEvenHeaderSubrange(), 
headerFooter.getOddHeaderSubrange() };
+        Range[] headers = new Range[]{headerFooter.getFirstHeaderSubrange(),
+                headerFooter.getEvenHeaderSubrange(), 
headerFooter.getOddHeaderSubrange()};
         handleHeaderFooter(headers, "header", document, pictures, 
pictureTable, xhtml);
 
         // Do the main paragraph text
         Range r = document.getRange();
         ListManager listManager = new ListManager(document);
-        for(int i=0; i<r.numParagraphs(); i++) {
-           Paragraph p = r.getParagraph(i);
-           i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, 
pictures, pictureTable, listManager, xhtml);
+        for (int i = 0; i < r.numParagraphs(); i++) {
+            Paragraph p = r.getParagraph(i);
+            i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, 
pictures, pictureTable, listManager, xhtml);
         }
 
         // Do everything else
-        for (String paragraph: wordExtractor.getMainTextboxText()) {
+        for (String paragraph : wordExtractor.getMainTextboxText()) {
             xhtml.element("p", paragraph);
         }
 
-       for (String paragraph : wordExtractor.getFootnoteText()) {
+        for (String paragraph : wordExtractor.getFootnoteText()) {
             xhtml.element("p", paragraph);
         }
 
@@ -127,16 +186,16 @@ public class WordExtractor extends Abstr
         }
 
         // Do any footers, if present
-        Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(),
-                headerFooter.getEvenFooterSubrange(), 
headerFooter.getOddFooterSubrange() };
+        Range[] footers = new Range[]{headerFooter.getFirstFooterSubrange(),
+                headerFooter.getEvenFooterSubrange(), 
headerFooter.getOddFooterSubrange()};
         handleHeaderFooter(footers, "footer", document, pictures, 
pictureTable, xhtml);
 
         // Handle any pictures that we haven't output yet
-        for(Picture p = pictures.nextUnclaimed(); p != null; ) {
-           handlePictureCharacterRun(
-                 null, p, pictures, xhtml
-           );
-           p = pictures.nextUnclaimed();
+        for (Picture p = pictures.nextUnclaimed(); p != null; ) {
+            handlePictureCharacterRun(
+                    null, p, pictures, xhtml
+            );
+            p = pictures.nextUnclaimed();
         }
 
         // Handle any embeded office documents
@@ -148,32 +207,24 @@ public class WordExtractor extends Abstr
                     handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
                 }
             }
-        } catch(FileNotFoundException e) {
-        }
-    }
-
-    private static int countParagraphs(Range... ranges) {
-        int count = 0;
-        for (Range r : ranges) {
-            if (r != null) { count += r.numParagraphs(); }
+        } catch (FileNotFoundException e) {
         }
-        return count;
     }
 
     private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument 
document,
-          PicturesSource pictures, PicturesTable pictureTable, 
XHTMLContentHandler xhtml)
-          throws SAXException, IOException, TikaException {
+                                    PicturesSource pictures, PicturesTable 
pictureTable, XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
         if (countParagraphs(ranges) > 0) {
             xhtml.startElement("div", "class", type);
             ListManager listManager = new ListManager(document);
             for (Range r : ranges) {
                 if (r != null) {
-                    for(int i=0; i<r.numParagraphs(); i++) {
+                    for (int i = 0; i < r.numParagraphs(); i++) {
                         Paragraph p = r.getParagraph(i);
 
                         i += handleParagraph(p, 0, r, document,
                                 FieldsDocumentPart.HEADER, pictures, 
pictureTable, listManager, xhtml);
-                     }
+                    }
                 }
             }
             xhtml.endElement("div");
@@ -181,275 +232,276 @@ public class WordExtractor extends Abstr
     }
 
     private int handleParagraph(Paragraph p, int parentTableLevel, Range r, 
HWPFDocument document,
-          FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable 
pictureTable, ListManager listManager,
-          XHTMLContentHandler xhtml) throws SAXException, IOException, 
TikaException {
-       // Note - a poi bug means we can't currently properly recurse
-       //  into nested tables, so currently we don't
-       if(p.isInTable() && p.getTableLevel() > parentTableLevel && 
parentTableLevel==0) {
-          Table t = r.getTable(p);
-          xhtml.startElement("table");
-          xhtml.startElement("tbody");
-          for(int rn=0; rn<t.numRows(); rn++) {
-             TableRow row = t.getRow(rn);
-             xhtml.startElement("tr");
-             for(int cn=0; cn<row.numCells(); cn++) {
-                TableCell cell = row.getCell(cn);
-                xhtml.startElement("td");
-
-                for(int pn=0; pn<cell.numParagraphs(); pn++) {
-                   Paragraph cellP = cell.getParagraph(pn);
-                   handleParagraph(cellP, p.getTableLevel(), cell, document, 
docPart, pictures, pictureTable, listManager, xhtml);
-                }
-                xhtml.endElement("td");
-             }
-             xhtml.endElement("tr");
-          }
-          xhtml.endElement("tbody");
-          xhtml.endElement("table");
-          return (t.numParagraphs()-1);
-       }
+                                FieldsDocumentPart docPart, PicturesSource 
pictures, PicturesTable pictureTable, ListManager listManager,
+                                XHTMLContentHandler xhtml) throws 
SAXException, IOException, TikaException {
+        // Note - a poi bug means we can't currently properly recurse
+        //  into nested tables, so currently we don't
+        if (p.isInTable() && p.getTableLevel() > parentTableLevel && 
parentTableLevel == 0) {
+            Table t = r.getTable(p);
+            xhtml.startElement("table");
+            xhtml.startElement("tbody");
+            for (int rn = 0; rn < t.numRows(); rn++) {
+                TableRow row = t.getRow(rn);
+                xhtml.startElement("tr");
+                for (int cn = 0; cn < row.numCells(); cn++) {
+                    TableCell cell = row.getCell(cn);
+                    xhtml.startElement("td");
+
+                    for (int pn = 0; pn < cell.numParagraphs(); pn++) {
+                        Paragraph cellP = cell.getParagraph(pn);
+                        handleParagraph(cellP, p.getTableLevel(), cell, 
document, docPart, pictures, pictureTable, listManager, xhtml);
+                    }
+                    xhtml.endElement("td");
+                }
+                xhtml.endElement("tr");
+            }
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+            return (t.numParagraphs() - 1);
+        }
 
-       String text = p.text();
-       if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
+        String text = p.text();
+        if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
             // Skip empty paragraphs
             return 0;
-       }
+        }
 
-       TagAndStyle tas;
-       String numbering = null;
+        TagAndStyle tas;
+        String numbering = null;
 
-       if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {
-           StyleDescription style =
-              document.getStyleSheet().getStyleDescription(p.getStyleIndex());
-           if (style != null && style.getName() != null && 
style.getName().length() > 0) {
-               if (p.isInList()) {
-                   numbering = listManager.getFormattedNumber(p);
-               }
-               tas = buildParagraphTagAndStyle(style.getName(), 
(parentTableLevel>0));
-           } else {
-               tas = new TagAndStyle("p", null);
-           }
-       } else {
-           tas = new TagAndStyle("p", null);
-       }
-
-       if(tas.getStyleClass() != null) {
-           xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
-       } else {
-           xhtml.startElement(tas.getTag());
-       }
-
-       if (numbering != null) {
-           xhtml.characters(numbering);
-       }
-
-       for(int j=0; j<p.numCharacterRuns(); j++) {
-          CharacterRun cr = p.getCharacterRun(j);
-
-          // FIELD_BEGIN_MARK:
-          if (cr.text().getBytes(IOUtils.UTF_8)[0] == 0x13) {
-             Field field = document.getFields().getFieldByStartOffset(docPart, 
cr.getStartOffset());
-             // 58 is an embedded document
-             // 56 is a document link
-             if (field != null && (field.getType() == 58 || field.getType() == 
56)) {
-               // Embedded Object: add a <div
-               // class="embedded" id="_X"/> so consumer can see where
-               // in the main text each embedded document
-               // occurred:
-               String id = "_" + 
field.getMarkSeparatorCharacterRun(r).getPicOffset();
-               AttributesImpl attributes = new AttributesImpl();
-               attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
-               attributes.addAttribute("", "id", "id", "CDATA", id);
-               xhtml.startElement("div", attributes);
-               xhtml.endElement("div");
-             }
-          }
-
-          if(cr.text().equals("\u0013")) {
-             j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, 
xhtml);
-          } else if(cr.text().startsWith("\u0008")) {
-             // Floating Picture(s)
-             for(int pn=0; pn<cr.text().length(); pn++) {
-                // Assume they're in the order from the unclaimed list...
-                Picture picture = pictures.nextUnclaimed();
+        if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
+            StyleDescription style =
+                    
document.getStyleSheet().getStyleDescription(p.getStyleIndex());
+            if (style != null && style.getName() != null && 
style.getName().length() > 0) {
+                if (p.isInList()) {
+                    numbering = listManager.getFormattedNumber(p);
+                }
+                tas = buildParagraphTagAndStyle(style.getName(), 
(parentTableLevel > 0));
+            } else {
+                tas = new TagAndStyle("p", null);
+            }
+        } else {
+            tas = new TagAndStyle("p", null);
+        }
 
-                // Output
+        if (tas.getStyleClass() != null) {
+            xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
+        } else {
+            xhtml.startElement(tas.getTag());
+        }
+
+        if (numbering != null) {
+            xhtml.characters(numbering);
+        }
+
+        for (int j = 0; j < p.numCharacterRuns(); j++) {
+            CharacterRun cr = p.getCharacterRun(j);
+
+            // FIELD_BEGIN_MARK:
+            if (cr.text().getBytes(IOUtils.UTF_8)[0] == 0x13) {
+                Field field = 
document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
+                // 58 is an embedded document
+                // 56 is a document link
+                if (field != null && (field.getType() == 58 || field.getType() 
== 56)) {
+                    // Embedded Object: add a <div
+                    // class="embedded" id="_X"/> so consumer can see where
+                    // in the main text each embedded document
+                    // occurred:
+                    String id = "_" + 
field.getMarkSeparatorCharacterRun(r).getPicOffset();
+                    AttributesImpl attributes = new AttributesImpl();
+                    attributes.addAttribute("", "class", "class", "CDATA", 
"embedded");
+                    attributes.addAttribute("", "id", "id", "CDATA", id);
+                    xhtml.startElement("div", attributes);
+                    xhtml.endElement("div");
+                }
+            }
+
+            if (cr.text().equals("\u0013")) {
+                j += handleSpecialCharacterRuns(p, j, tas.isHeading(), 
pictures, xhtml);
+            } else if (cr.text().startsWith("\u0008")) {
+                // Floating Picture(s)
+                for (int pn = 0; pn < cr.text().length(); pn++) {
+                    // Assume they're in the order from the unclaimed list...
+                    Picture picture = pictures.nextUnclaimed();
+
+                    // Output
+                    handlePictureCharacterRun(cr, picture, pictures, xhtml);
+                }
+            } else if (pictureTable.hasPicture(cr)) {
+                // Inline Picture
+                Picture picture = pictures.getFor(cr);
                 handlePictureCharacterRun(cr, picture, pictures, xhtml);
-             }
-          } else if(pictureTable.hasPicture(cr)) {
-             // Inline Picture
-             Picture picture = pictures.getFor(cr);
-             handlePictureCharacterRun(cr, picture, pictures, xhtml);
-          } else {
-             handleCharacterRun(cr, tas.isHeading(), xhtml);
-          }
-       }
-
-       // Close any still open style tags
-       if (curStrikeThrough) {
-         xhtml.endElement("s");
-         curStrikeThrough = false;
-       }
-       if (curItalic) {
-         xhtml.endElement("i");
-         curItalic = false;
-       }
-       if (curBold) {
-         xhtml.endElement("b");
-         curBold = false;
-       }
+            } else {
+                handleCharacterRun(cr, tas.isHeading(), xhtml);
+            }
+        }
+
+        // Close any still open style tags
+        if (curStrikeThrough) {
+            xhtml.endElement("s");
+            curStrikeThrough = false;
+        }
+        if (curItalic) {
+            xhtml.endElement("i");
+            curItalic = false;
+        }
+        if (curBold) {
+            xhtml.endElement("b");
+            curBold = false;
+        }
 
-       xhtml.endElement(tas.getTag());
+        xhtml.endElement(tas.getTag());
 
-       return 0;
+        return 0;
     }
 
     private void handleCharacterRun(CharacterRun cr, boolean skipStyling, 
XHTMLContentHandler xhtml)
-          throws SAXException {
-       // Skip trailing newlines
-       if(!isRendered(cr) || cr.text().equals("\r"))
-          return;
-
-       if(!skipStyling) {
-         if (cr.isBold() != curBold) {
-           // Enforce nesting -- must close s and i tags
-           if (curStrikeThrough) {
-             xhtml.endElement("s");
-             curStrikeThrough = false;
-           }
-           if (curItalic) {
-             xhtml.endElement("i");
-             curItalic = false;
-           }
-           if (cr.isBold()) {
-             xhtml.startElement("b");
-           } else {
-             xhtml.endElement("b");
-           }
-           curBold = cr.isBold();
-         }
-
-         if (cr.isItalic() != curItalic) {
-           // Enforce nesting -- must close s tag
-           if (curStrikeThrough) {
-             xhtml.endElement("s");
-             curStrikeThrough = false;
-           }
-           if (cr.isItalic()) {
-             xhtml.startElement("i");
-           } else {
-             xhtml.endElement("i");
-           }
-           curItalic = cr.isItalic();
-         }
-
-         if (cr.isStrikeThrough() != curStrikeThrough) {
-           if (cr.isStrikeThrough()) {
-             xhtml.startElement("s");
-           } else {
-             xhtml.endElement("s");
-           }
-           curStrikeThrough = cr.isStrikeThrough();
-         }
-       }
-
-       // Clean up the text
-       String text = cr.text();
-       text = text.replace('\r', '\n');
-       if(text.endsWith("\u0007")) {
-          // Strip the table cell end marker
-          text = text.substring(0, text.length()-1);
-       }
-
-       // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
-
-       // Non-breaking hyphens are returned as char 30
-       text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
-
-       // Non-required hyphens to zero-width space
-       text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);
-
-       // Control characters as line break
-       text = text.replaceAll("[\u0000-\u001f]", "\n");
-       xhtml.characters(text);
+            throws SAXException {
+        // Skip trailing newlines
+        if (!isRendered(cr) || cr.text().equals("\r"))
+            return;
+
+        if (!skipStyling) {
+            if (cr.isBold() != curBold) {
+                // Enforce nesting -- must close s and i tags
+                if (curStrikeThrough) {
+                    xhtml.endElement("s");
+                    curStrikeThrough = false;
+                }
+                if (curItalic) {
+                    xhtml.endElement("i");
+                    curItalic = false;
+                }
+                if (cr.isBold()) {
+                    xhtml.startElement("b");
+                } else {
+                    xhtml.endElement("b");
+                }
+                curBold = cr.isBold();
+            }
+
+            if (cr.isItalic() != curItalic) {
+                // Enforce nesting -- must close s tag
+                if (curStrikeThrough) {
+                    xhtml.endElement("s");
+                    curStrikeThrough = false;
+                }
+                if (cr.isItalic()) {
+                    xhtml.startElement("i");
+                } else {
+                    xhtml.endElement("i");
+                }
+                curItalic = cr.isItalic();
+            }
+
+            if (cr.isStrikeThrough() != curStrikeThrough) {
+                if (cr.isStrikeThrough()) {
+                    xhtml.startElement("s");
+                } else {
+                    xhtml.endElement("s");
+                }
+                curStrikeThrough = cr.isStrikeThrough();
+            }
+        }
+
+        // Clean up the text
+        String text = cr.text();
+        text = text.replace('\r', '\n');
+        if (text.endsWith("\u0007")) {
+            // Strip the table cell end marker
+            text = text.substring(0, text.length() - 1);
+        }
+
+        // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
+
+        // Non-breaking hyphens are returned as char 30
+        text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
+
+        // Non-required hyphens to zero-width space
+        text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);
+
+        // Control characters as line break
+        text = text.replaceAll("[\u0000-\u001f]", "\n");
+        xhtml.characters(text);
     }
+
     /**
      * Can be \13..text..\15 or \13..control..\14..text..\15 .
      * Nesting is allowed
      */
     private int handleSpecialCharacterRuns(Paragraph p, int index, boolean 
skipStyling,
-          PicturesSource pictures, XHTMLContentHandler xhtml) throws 
SAXException, TikaException, IOException {
-       List<CharacterRun> controls = new ArrayList<CharacterRun>();
-       List<CharacterRun> texts = new ArrayList<CharacterRun>();
-       boolean has14 = false;
-
-       // Split it into before and after the 14
-       int i;
-       for(i=index+1; i<p.numCharacterRuns(); i++) {
-          CharacterRun cr = p.getCharacterRun(i);
-          if(cr.text().equals("\u0013")) {
-             // Nested, oh joy...
-             int increment = handleSpecialCharacterRuns(p, i+1, skipStyling, 
pictures, xhtml);
-             i += increment;
-          } else if(cr.text().equals("\u0014")) {
-             has14 = true;
-          } else if(cr.text().equals("\u0015")) {
-             if(!has14) {
-                texts = controls;
-                controls = new ArrayList<CharacterRun>();
-             }
-             break;
-          } else {
-             if(has14) {
-                texts.add(cr);
-             } else {
-                controls.add(cr);
-             }
-          }
-       }
-
-       // Do we need to do something special with this?
-       if(controls.size() > 0) {
-          String text = controls.get(0).text();
-          for(int j=1; j<controls.size(); j++) {
-             text += controls.get(j).text();
-          }
-
-          if((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK"))
-                 && text.indexOf('"') > -1) {
-              int start = text.indexOf('"') + 1;
-              int end = findHyperlinkEnd(text, start);
-              String url = "";
-              if (start >= 0 && start < end && end <= text.length()) {
-                  url = text.substring(start, end);
-              }
-
-             xhtml.startElement("a", "href", url);
-             for(CharacterRun cr : texts) {
-                handleCharacterRun(cr, skipStyling, xhtml);
-             }
-             xhtml.endElement("a");
-          } else {
-             // Just output the text ones
-             for(CharacterRun cr : texts) {
-                if(pictures.hasPicture(cr)) {
-                   Picture picture = pictures.getFor(cr);
-                   handlePictureCharacterRun(cr, picture, pictures, xhtml);
+                                           PicturesSource pictures, 
XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
+        List<CharacterRun> controls = new ArrayList<CharacterRun>();
+        List<CharacterRun> texts = new ArrayList<CharacterRun>();
+        boolean has14 = false;
+
+        // Split it into before and after the 14
+        int i;
+        for (i = index + 1; i < p.numCharacterRuns(); i++) {
+            CharacterRun cr = p.getCharacterRun(i);
+            if (cr.text().equals("\u0013")) {
+                // Nested, oh joy...
+                int increment = handleSpecialCharacterRuns(p, i + 1, 
skipStyling, pictures, xhtml);
+                i += increment;
+            } else if (cr.text().equals("\u0014")) {
+                has14 = true;
+            } else if (cr.text().equals("\u0015")) {
+                if (!has14) {
+                    texts = controls;
+                    controls = new ArrayList<CharacterRun>();
+                }
+                break;
+            } else {
+                if (has14) {
+                    texts.add(cr);
                 } else {
-                   handleCharacterRun(cr, skipStyling, xhtml);
+                    controls.add(cr);
                 }
-             }
-          }
-       } else {
-          // We only had text
-          // Output as-is
-          for(CharacterRun cr : texts) {
-             handleCharacterRun(cr, skipStyling, xhtml);
-          }
-       }
+            }
+        }
+
+        // Do we need to do something special with this?
+        if (controls.size() > 0) {
+            String text = controls.get(0).text();
+            for (int j = 1; j < controls.size(); j++) {
+                text += controls.get(j).text();
+            }
+
+            if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK"))
+                    && text.indexOf('"') > -1) {
+                int start = text.indexOf('"') + 1;
+                int end = findHyperlinkEnd(text, start);
+                String url = "";
+                if (start >= 0 && start < end && end <= text.length()) {
+                    url = text.substring(start, end);
+                }
+
+                xhtml.startElement("a", "href", url);
+                for (CharacterRun cr : texts) {
+                    handleCharacterRun(cr, skipStyling, xhtml);
+                }
+                xhtml.endElement("a");
+            } else {
+                // Just output the text ones
+                for (CharacterRun cr : texts) {
+                    if (pictures.hasPicture(cr)) {
+                        Picture picture = pictures.getFor(cr);
+                        handlePictureCharacterRun(cr, picture, pictures, 
xhtml);
+                    } else {
+                        handleCharacterRun(cr, skipStyling, xhtml);
+                    }
+                }
+            }
+        } else {
+            // We only had text
+            // Output as-is
+            for (CharacterRun cr : texts) {
+                handleCharacterRun(cr, skipStyling, xhtml);
+            }
+        }
 
-       // Tell them how many to skip over
-       return i-index;
+        // Tell them how many to skip over
+        return i - index;
     }
 
     //temporary work around for TIKA-1512
@@ -478,48 +530,48 @@ public class WordExtractor extends Abstr
     }
 
     private void handlePictureCharacterRun(CharacterRun cr, Picture picture, 
PicturesSource pictures, XHTMLContentHandler xhtml)
-          throws SAXException, IOException, TikaException {
-       if(!isRendered(cr) || picture == null) {
-          // Oh dear, we've run out...
-          // Probably caused by multiple \u0008 images referencing
-          //  the same real image
-          return;
-       }
-
-       // Which one is it?
-       String extension = picture.suggestFileExtension();
-       int pictureNumber = pictures.pictureNumber(picture);
-
-       // Make up a name for the picture
-       // There isn't one in the file, but we need to be able to reference
-       //  the picture from the img tag and the embedded resource
-       String filename = "image"+pictureNumber+(extension.length()>0 ? 
"."+extension : "");
-
-       // Grab the mime type for the picture
-       String mimeType = picture.getMimeType();
-
-       // Output the img tag
-       AttributesImpl attr = new AttributesImpl();
-       attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
-       attr.addAttribute("", "alt", "alt", "CDATA", filename);
-       xhtml.startElement("img", attr);
-       xhtml.endElement("img");
-
-       // Have we already output this one?
-       // (Only expose each individual image once)
-       if(! pictures.hasOutput(picture)) {
-          TikaInputStream stream = TikaInputStream.get(picture.getContent());
-          handleEmbeddedResource(stream, filename, null, mimeType, xhtml, 
false);
-          pictures.recordOutput(picture);
-       }
+            throws SAXException, IOException, TikaException {
+        if (!isRendered(cr) || picture == null) {
+            // Oh dear, we've run out...
+            // Probably caused by multiple \u0008 images referencing
+            //  the same real image
+            return;
+        }
+
+        // Which one is it?
+        String extension = picture.suggestFileExtension();
+        int pictureNumber = pictures.pictureNumber(picture);
+
+        // Make up a name for the picture
+        // There isn't one in the file, but we need to be able to reference
+        //  the picture from the img tag and the embedded resource
+        String filename = "image" + pictureNumber + (extension.length() > 0 ? 
"." + extension : "");
+
+        // Grab the mime type for the picture
+        String mimeType = picture.getMimeType();
+
+        // Output the img tag
+        AttributesImpl attr = new AttributesImpl();
+        attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
+        attr.addAttribute("", "alt", "alt", "CDATA", filename);
+        xhtml.startElement("img", attr);
+        xhtml.endElement("img");
+
+        // Have we already output this one?
+        // (Only expose each individual image once)
+        if (!pictures.hasOutput(picture)) {
+            TikaInputStream stream = TikaInputStream.get(picture.getContent());
+            handleEmbeddedResource(stream, filename, null, mimeType, xhtml, 
false);
+            pictures.recordOutput(picture);
+        }
     }
 
     /**
      * Outputs a section of text if the given text is non-empty.
      *
-     * @param xhtml XHTML content handler
+     * @param xhtml   XHTML content handler
      * @param section the class of the &lt;div/&gt; section emitted
-     * @param text text to be emitted, if any
+     * @param text    text to be emitted, if any
      * @throws SAXException if an error occurs
      */
     private void addTextIfAny(
@@ -544,77 +596,11 @@ public class WordExtractor extends Abstr
         HWPFOldDocument doc = new HWPFOldDocument(root);
         Word6Extractor extractor = new Word6Extractor(doc);
 
-        for(String p : extractor.getParagraphText()) {
+        for (String p : extractor.getParagraphText()) {
             xhtml.element("p", p);
         }
     }
 
-    private static final Map<String,TagAndStyle> fixedParagraphStyles = new 
HashMap<String,TagAndStyle>();
-    private static final TagAndStyle defaultParagraphStyle = new 
TagAndStyle("p", null);
-    static {
-        fixedParagraphStyles.put("Default", defaultParagraphStyle);
-        fixedParagraphStyles.put("Normal", defaultParagraphStyle);
-        fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
-        fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
-        fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
-        fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", 
"subtitle"));
-        fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", 
null));
-    }
-
-    /**
-     * Given a style name, return what tag should be used, and
-     *  what style should be applied to it.
-     */
-    public static TagAndStyle buildParagraphTagAndStyle(String styleName, 
boolean isTable) {
-       TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
-       if (tagAndStyle != null) {
-           return tagAndStyle;
-       }
-
-       if (styleName.equals("Table Contents") && isTable) {
-           return defaultParagraphStyle;
-       }
-
-       String tag = "p";
-       String styleClass = null;
-
-       if(styleName.startsWith("heading") || styleName.startsWith("Heading")) {
-           // "Heading 3" or "Heading2" or "heading 4"
-           int num = 1;
-           try {
-               num = Integer.parseInt(
-                                      styleName.substring(styleName.length()-1)
-                                      );
-           } catch(NumberFormatException e) {}
-           // Turn it into a H1 - H6 (H7+ isn't valid!)
-           tag = "h" + Math.min(num, 6);
-       } else {
-           styleClass = styleName.replace(' ', '_');
-           styleClass = styleClass.substring(0,1).toLowerCase(Locale.ROOT) +
-               styleClass.substring(1);
-       }
-
-       return new TagAndStyle(tag,styleClass);
-    }
-
-    public static class TagAndStyle {
-       private String tag;
-       private String styleClass;
-       public TagAndStyle(String tag, String styleClass) {
-         this.tag = tag;
-         this.styleClass = styleClass;
-       }
-       public String getTag() {
-         return tag;
-       }
-       public String getStyleClass() {
-         return styleClass;
-       }
-       public boolean isHeading() {
-          return tag.length()==2 && tag.startsWith("h");
-       }
-    }
-
     /**
      * Determines if character run should be included in the extraction.
      *
@@ -622,81 +608,103 @@ public class WordExtractor extends Abstr
      * @return true if character run should be included in extraction.
      */
     private boolean isRendered(final CharacterRun cr) {
-          return cr == null || !cr.isMarkedDeleted();
+        return cr == null || !cr.isMarkedDeleted();
     }
 
+    public static class TagAndStyle {
+        private String tag;
+        private String styleClass;
+
+        public TagAndStyle(String tag, String styleClass) {
+            this.tag = tag;
+            this.styleClass = styleClass;
+        }
+
+        public String getTag() {
+            return tag;
+        }
+
+        public String getStyleClass() {
+            return styleClass;
+        }
+
+        public boolean isHeading() {
+            return tag.length() == 2 && tag.startsWith("h");
+        }
+    }
 
     /**
      * Provides access to the pictures both by offset, iteration
-     *  over the un-claimed, and peeking forward
+     * over the un-claimed, and peeking forward
      */
     private static class PicturesSource {
-       private PicturesTable picturesTable;
-       private Set<Picture> output = new HashSet<Picture>();
-       private Map<Integer,Picture> lookup;
-       private List<Picture> nonU1based;
-       private List<Picture> all;
-       private int pn = 0;
-
-       private PicturesSource(HWPFDocument doc) {
-          picturesTable = doc.getPicturesTable();
-          all = picturesTable.getAllPictures();
-
-          // Build the Offset-Picture lookup map
-          lookup = new HashMap<Integer, Picture>();
-          for(Picture p : all) {
-             lookup.put(p.getStartOffset(), p);
-          }
-
-          // Work out which Pictures aren't referenced by
-          //  a \u0001 in the main text
-          // These are \u0008 escher floating ones, ones
-          //  found outside the normal text, and who
-          //  knows what else...
-          nonU1based = new ArrayList<Picture>();
-          nonU1based.addAll(all);
-          Range r = doc.getRange();
-          for(int i=0; i<r.numCharacterRuns(); i++) {
-             CharacterRun cr = r.getCharacterRun(i);
-             if(picturesTable.hasPicture(cr)) {
-                Picture p = getFor(cr);
-                int at = nonU1based.indexOf(p);
-                nonU1based.set(at, null);
-             }
-          }
-       }
-
-       private boolean hasPicture(CharacterRun cr) {
-          return picturesTable.hasPicture(cr);
-       }
-
-       private void recordOutput(Picture picture) {
-          output.add(picture);
-       }
-       private boolean hasOutput(Picture picture) {
-          return output.contains(picture);
-       }
-
-       private int pictureNumber(Picture picture) {
-          return all.indexOf(picture) + 1;
-       }
-
-       private Picture getFor(CharacterRun cr) {
-          return lookup.get(cr.getPicOffset());
-       }
-
-       /**
-        * Return the next unclaimed one, used towards
-        *  the end
-        */
-       private Picture nextUnclaimed() {
-          Picture p = null;
-          while(pn < nonU1based.size()) {
-             p = nonU1based.get(pn);
-             pn++;
-             if(p != null) return p;
-          }
-          return null;
-       }
+        private PicturesTable picturesTable;
+        private Set<Picture> output = new HashSet<Picture>();
+        private Map<Integer, Picture> lookup;
+        private List<Picture> nonU1based;
+        private List<Picture> all;
+        private int pn = 0;
+
+        private PicturesSource(HWPFDocument doc) {
+            picturesTable = doc.getPicturesTable();
+            all = picturesTable.getAllPictures();
+
+            // Build the Offset-Picture lookup map
+            lookup = new HashMap<Integer, Picture>();
+            for (Picture p : all) {
+                lookup.put(p.getStartOffset(), p);
+            }
+
+            // Work out which Pictures aren't referenced by
+            //  a \u0001 in the main text
+            // These are \u0008 escher floating ones, ones
+            //  found outside the normal text, and who
+            //  knows what else...
+            nonU1based = new ArrayList<Picture>();
+            nonU1based.addAll(all);
+            Range r = doc.getRange();
+            for (int i = 0; i < r.numCharacterRuns(); i++) {
+                CharacterRun cr = r.getCharacterRun(i);
+                if (picturesTable.hasPicture(cr)) {
+                    Picture p = getFor(cr);
+                    int at = nonU1based.indexOf(p);
+                    nonU1based.set(at, null);
+                }
+            }
+        }
+
+        private boolean hasPicture(CharacterRun cr) {
+            return picturesTable.hasPicture(cr);
+        }
+
+        private void recordOutput(Picture picture) {
+            output.add(picture);
+        }
+
+        private boolean hasOutput(Picture picture) {
+            return output.contains(picture);
+        }
+
+        private int pictureNumber(Picture picture) {
+            return all.indexOf(picture) + 1;
+        }
+
+        private Picture getFor(CharacterRun cr) {
+            return lookup.get(cr.getPicOffset());
+        }
+
+        /**
+         * Return the next unclaimed one, used towards
+         * the end
+         */
+        private Picture nextUnclaimed() {
+            Picture p = null;
+            while (pn < nonU1based.size()) {
+                p = nonU1based.get(pn);
+                pn++;
+                if (p != null) return p;
+            }
+            return null;
+        }
     }
 }


Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 Fri May 29 14:36:21 2015
@@ -53,7 +53,7 @@ import org.xml.sax.helpers.AttributesImp
 
 /**
  * Base class for all Tika OOXML extractors.
- * 
+ * <p/>
  * Tika extractors decorate POI extractors so that the parsed content of
  * documents is returned as a sequence of XHTML SAX events. Subclasses must
  * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} 
that
@@ -67,17 +67,15 @@ public abstract class AbstractOOXMLExtra
 
     private static final String TYPE_OLE_OBJECT =
             "application/vnd.openxmlformats-officedocument.oleObject";
-
-    protected POIXMLTextExtractor extractor;
-
     private final EmbeddedDocumentExtractor embeddedExtractor;
+    protected POIXMLTextExtractor extractor;
 
     public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor 
extractor) {
         this.extractor = extractor;
 
         EmbeddedDocumentExtractor ex = 
context.get(EmbeddedDocumentExtractor.class);
 
-        if (ex==null) {
+        if (ex == null) {
             embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
         } else {
             embeddedExtractor = ex;
@@ -101,7 +99,7 @@ public abstract class AbstractOOXMLExtra
 
     /**
      * @see 
org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
-     *      org.apache.tika.metadata.Metadata)
+     * org.apache.tika.metadata.Metadata)
      */
     public void getXHTML(
             ContentHandler handler, Metadata metadata, ParseContext context)
@@ -113,55 +111,55 @@ public abstract class AbstractOOXMLExtra
 
         // Now do any embedded parts
         handleEmbeddedParts(handler);
-        
+
         // thumbnail
         handleThumbnail(handler);
 
         xhtml.endDocument();
     }
-  
+
     protected String getJustFileName(String desc) {
-      int idx = desc.lastIndexOf('/');
-      if (idx != -1) {
-        desc = desc.substring(idx+1);
-      }
-      idx = desc.lastIndexOf('.');
-      if (idx != -1) {
-        desc = desc.substring(0, idx);
-      }
+        int idx = desc.lastIndexOf('/');
+        if (idx != -1) {
+            desc = desc.substring(idx + 1);
+        }
+        idx = desc.lastIndexOf('.');
+        if (idx != -1) {
+            desc = desc.substring(0, idx);
+        }
 
-      return desc;
+        return desc;
     }
-    
-    private void handleThumbnail( ContentHandler handler ) {
+
+    private void handleThumbnail(ContentHandler handler) {
         try {
             OPCPackage opcPackage = extractor.getPackage();
-            for (PackageRelationship rel : opcPackage.getRelationshipsByType( 
PackageRelationshipTypes.THUMBNAIL )) {
+            for (PackageRelationship rel : 
opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
                 PackagePart tPart = opcPackage.getPart(rel);
                 InputStream tStream = tPart.getInputStream();
-                Metadata thumbnailMetadata = new Metadata();                
+                Metadata thumbnailMetadata = new Metadata();
                 String thumbName = tPart.getPartName().getName();
                 thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName);
-                
+
                 AttributesImpl attributes = new AttributesImpl();
                 attributes.addAttribute(XHTML, "class", "class", "CDATA", 
"embedded");
                 attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName);
                 handler.startElement(XHTML, "div", "div", attributes);
                 handler.endElement(XHTML, "div", "div");
-                
+
                 thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, 
thumbName);
                 thumbnailMetadata.set(Metadata.CONTENT_TYPE, 
tPart.getContentType());
                 thumbnailMetadata.set(TikaCoreProperties.TITLE, 
tPart.getPartName().getName());
-                
+
                 if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
                     
embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new 
EmbeddedContentHandler(handler), thumbnailMetadata, false);
                 }
-                
+
                 tStream.close();
             }
-         } catch (Exception ex) {
-             
-         }
+        } catch (Exception ex) {
+
+        }
     }
 
     private void handleEmbeddedParts(ContentHandler handler)
@@ -175,9 +173,9 @@ public abstract class AbstractOOXMLExtra
                     if (sourceURI != null) {
                         sourceDesc = getJustFileName(sourceURI.getPath());
                         if (sourceDesc.startsWith("slide")) {
-                          sourceDesc += "_";
+                            sourceDesc += "_";
                         } else {
-                          sourceDesc = "";
+                            sourceDesc = "";
                         }
                     } else {
                         sourceDesc = "";
@@ -215,11 +213,11 @@ public abstract class AbstractOOXMLExtra
     private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, 
String rel)
             throws IOException, SAXException {
         // A POIFSFileSystem needs to be at least 3 blocks big to be valid
-        if (part.getSize() >= 0 && part.getSize() < 512*3) {
-           // Too small, skip
-           return;
+        if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
+            // Too small, skip
+            return;
         }
-       
+
         // Open the POIFS (OLE2) structure and process
         POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
         try {
@@ -229,19 +227,19 @@ public abstract class AbstractOOXMLExtra
 
             DirectoryNode root = fs.getRoot();
             POIFSDocumentType type = POIFSDocumentType.detectType(root);
-            
+
             if (root.hasEntry("CONTENTS")
-                  && root.hasEntry("\u0001Ole")
-                  && root.hasEntry("\u0001CompObj")
-                  && root.hasEntry("\u0003ObjInfo")) {
-               // TIKA-704: OLE 2.0 embedded non-Office document?
-               stream = TikaInputStream.get(
-                     fs.createDocumentInputStream("CONTENTS"));
-               if (embeddedExtractor.shouldParseEmbedded(metadata)) {
-                  embeddedExtractor.parseEmbedded(
-                        stream, new EmbeddedContentHandler(handler),
-                        metadata, false);
-               }
+                    && root.hasEntry("\u0001Ole")
+                    && root.hasEntry("\u0001CompObj")
+                    && root.hasEntry("\u0003ObjInfo")) {
+                // TIKA-704: OLE 2.0 embedded non-Office document?
+                stream = TikaInputStream.get(
+                        fs.createDocumentInputStream("CONTENTS"));
+                if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+                    embeddedExtractor.parseEmbedded(
+                            stream, new EmbeddedContentHandler(handler),
+                            metadata, false);
+                }
             } else if (POIFSDocumentType.OLE10_NATIVE == type) {
                 // TIKA-704: OLE 1.0 embedded document
                 Ole10Native ole =
@@ -302,12 +300,12 @@ public abstract class AbstractOOXMLExtra
      */
     protected abstract void buildXHTML(XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException;
-    
+
     /**
      * Return a list of the main parts of the document, used
-     *  when searching for embedded resources.
+     * when searching for embedded resources.
      * This should be all the parts of the document that end
-     *  up with things embedded into them.
+     * up with things embedded into them.
      */
     protected abstract List<PackagePart> getMainDocumentParts()
             throws TikaException;

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 Fri May 29 14:36:21 2015
@@ -19,10 +19,10 @@ package org.apache.tika.parser.microsoft
 import java.math.BigDecimal;
 import java.util.Date;
 
-import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.POIXMLProperties.CoreProperties;
 import org.apache.poi.POIXMLProperties.CustomProperties;
 import org.apache.poi.POIXMLProperties.ExtendedProperties;
+import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
 import org.apache.poi.openxml4j.util.Nullable;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
@@ -40,9 +40,9 @@ import org.openxmlformats.schemas.office
 
 /**
  * OOXML metadata extractor.
- * 
+ * <p/>
  * Currently POI doesn't support metadata extraction for OOXML.
- * 
+ *
  * @see OOXMLExtractor#getMetadataExtractor()
  */
 public class MetadataExtractor {
@@ -55,8 +55,8 @@ public class MetadataExtractor {
 
     public void extract(Metadata metadata) throws TikaException {
         if (extractor.getDocument() != null ||
-              (extractor instanceof XSSFEventBasedExcelExtractor && 
-               extractor.getPackage() != null)) {
+                (extractor instanceof XSSFEventBasedExcelExtractor &&
+                        extractor.getPackage() != null)) {
             extractMetadata(extractor.getCoreProperties(), metadata);
             extractMetadata(extractor.getExtendedProperties(), metadata);
             extractMetadata(extractor.getCustomProperties(), metadata);
@@ -89,15 +89,15 @@ public class MetadataExtractor {
         addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder
                 .getModifiedProperty());
         addProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder
-              .getModifiedProperty());
+                .getModifiedProperty());
         addProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder
                 .getRevisionProperty());
         // TODO: Move to OO subject in Tika 2.0
-        addProperty(metadata, 
TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, 
+        addProperty(metadata, 
TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
                 propsHolder.getSubjectProperty());
         addProperty(metadata, TikaCoreProperties.TITLE, 
propsHolder.getTitleProperty());
         addProperty(metadata, OfficeOpenXMLCore.VERSION, 
propsHolder.getVersionProperty());
-        
+
         // Legacy Tika-1.0 style stats
         // TODO Remove these in Tika 2.0
         addProperty(metadata, Metadata.CATEGORY, 
propsHolder.getCategoryProperty());
@@ -109,7 +109,7 @@ public class MetadataExtractor {
     }
 
     private void extractMetadata(ExtendedProperties properties,
-            Metadata metadata) {
+                                 Metadata metadata) {
         CTProperties propsHolder = properties.getUnderlyingProperties();
 
         addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, 
propsHolder.getApplication());
@@ -123,9 +123,9 @@ public class MetadataExtractor {
         addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, 
propsHolder.getTotalTime());
 
         if (propsHolder.getPages() > 0) {
-           metadata.set(PagedText.N_PAGES, propsHolder.getPages());
+            metadata.set(PagedText.N_PAGES, propsHolder.getPages());
         } else if (propsHolder.getSlides() > 0) {
-           metadata.set(PagedText.N_PAGES, propsHolder.getSlides());
+            metadata.set(PagedText.N_PAGES, propsHolder.getSlides());
         }
 
         // Process the document statistics
@@ -136,7 +136,7 @@ public class MetadataExtractor {
         addProperty(metadata, Office.WORD_COUNT, propsHolder.getWords());
         addProperty(metadata, Office.CHARACTER_COUNT, 
propsHolder.getCharacters());
         addProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, 
propsHolder.getCharactersWithSpaces());
-        
+
         // Legacy Tika-1.0 style stats
         // TODO Remove these in Tika 2.0
         addProperty(metadata, Metadata.APPLICATION_NAME, 
propsHolder.getApplication());
@@ -156,113 +156,89 @@ public class MetadataExtractor {
     }
 
     private void extractMetadata(CustomProperties properties,
-          Metadata metadata) {
-       
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
-           props = properties.getUnderlyingProperties();
-       for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
-          CTProperty property = props.getPropertyArray(i);
-          String val = null;
-          Date date = null;
-
-          if (property.isSetLpwstr()) {
-             val = property.getLpwstr(); 
-          }
-          else if (property.isSetLpstr()) {
-             val = property.getLpstr(); 
-          }
-          else if (property.isSetDate()) {
-             date = property.getDate().getTime(); 
-          }
-          else if (property.isSetFiletime()) {
-             date = property.getFiletime().getTime(); 
-          }
-
-          else if (property.isSetBool()) {
-             val = Boolean.toString( property.getBool() );
-          }
-
-          // Integers
-          else if (property.isSetI1()) {
-             val = Integer.toString(property.getI1()); 
-          }
-          else if (property.isSetI2()) {
-             val = Integer.toString(property.getI2()); 
-          }
-          else if (property.isSetI4()) {
-             val = Integer.toString(property.getI4()); 
-          }
-          else if (property.isSetI8()) {
-             val = Long.toString(property.getI8()); 
-          }
-          else if (property.isSetInt()) {
-             val = Integer.toString( property.getInt() ); 
-          }
-
-          // Unsigned Integers
-          else if (property.isSetUi1()) {
-             val = Integer.toString(property.getUi1()); 
-          }
-          else if (property.isSetUi2()) {
-             val = Integer.toString(property.getUi2()); 
-          }
-          else if (property.isSetUi4()) {
-             val = Long.toString(property.getUi4()); 
-          }
-          else if (property.isSetUi8()) {
-             val = property.getUi8().toString(); 
-          }
-          else if (property.isSetUint()) {
-             val = Long.toString(property.getUint()); 
-          }
-
-          // Reals
-          else if (property.isSetR4()) {
-             val = Float.toString( property.getR4() ); 
-          }
-          else if (property.isSetR8()) {
-             val = Double.toString( property.getR8() ); 
-          }
-          else if (property.isSetDecimal()) {
-             BigDecimal d = property.getDecimal();
-             if (d == null) {
-                val = null;
-             } else {
-                val = d.toPlainString();
-             }
-          }
-
-          else if (property.isSetArray()) {
-             // TODO Fetch the array values and output
-          }
-          else if (property.isSetVector()) {
-             // TODO Fetch the vector values and output
-          }
-
-          else if (property.isSetBlob() || property.isSetOblob()) {
-             // TODO Decode, if possible
-          }
-          else if (property.isSetStream() || property.isSetOstream() ||
-                   property.isSetVstream()) {
-             // TODO Decode, if possible
-          }
-          else if (property.isSetStorage() || property.isSetOstorage()) {
-             // TODO Decode, if possible
-          }
-          
-          else {
-             // This type isn't currently supported yet, skip the property
-          }
-          
-          String propName = "custom:" + property.getName();
-          if (date != null) {
-             Property tikaProp = Property.externalDate(propName);
-             metadata.set(tikaProp, date);
-          } else if (val != null) {
-             metadata.set(propName, val);
-          }
-       }
+                                 Metadata metadata) {
+        
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
+                props = properties.getUnderlyingProperties();
+        for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
+            CTProperty property = props.getPropertyArray(i);
+            String val = null;
+            Date date = null;
+
+            if (property.isSetLpwstr()) {
+                val = property.getLpwstr();
+            } else if (property.isSetLpstr()) {
+                val = property.getLpstr();
+            } else if (property.isSetDate()) {
+                date = property.getDate().getTime();
+            } else if (property.isSetFiletime()) {
+                date = property.getFiletime().getTime();
+            } else if (property.isSetBool()) {
+                val = Boolean.toString(property.getBool());
+            }
+
+            // Integers
+            else if (property.isSetI1()) {
+                val = Integer.toString(property.getI1());
+            } else if (property.isSetI2()) {
+                val = Integer.toString(property.getI2());
+            } else if (property.isSetI4()) {
+                val = Integer.toString(property.getI4());
+            } else if (property.isSetI8()) {
+                val = Long.toString(property.getI8());
+            } else if (property.isSetInt()) {
+                val = Integer.toString(property.getInt());
+            }
+
+            // Unsigned Integers
+            else if (property.isSetUi1()) {
+                val = Integer.toString(property.getUi1());
+            } else if (property.isSetUi2()) {
+                val = Integer.toString(property.getUi2());
+            } else if (property.isSetUi4()) {
+                val = Long.toString(property.getUi4());
+            } else if (property.isSetUi8()) {
+                val = property.getUi8().toString();
+            } else if (property.isSetUint()) {
+                val = Long.toString(property.getUint());
+            }
+
+            // Reals
+            else if (property.isSetR4()) {
+                val = Float.toString(property.getR4());
+            } else if (property.isSetR8()) {
+                val = Double.toString(property.getR8());
+            } else if (property.isSetDecimal()) {
+                BigDecimal d = property.getDecimal();
+                if (d == null) {
+                    val = null;
+                } else {
+                    val = d.toPlainString();
+                }
+            } else if (property.isSetArray()) {
+                // TODO Fetch the array values and output
+            } else if (property.isSetVector()) {
+                // TODO Fetch the vector values and output
+            } else if (property.isSetBlob() || property.isSetOblob()) {
+                // TODO Decode, if possible
+            } else if (property.isSetStream() || property.isSetOstream() ||
+                    property.isSetVstream()) {
+                // TODO Decode, if possible
+            } else if (property.isSetStorage() || property.isSetOstorage()) {
+                // TODO Decode, if possible
+            } else {
+                // This type isn't currently supported yet, skip the property
+            }
+
+            String propName = "custom:" + property.getName();
+            if (date != null) {
+                Property tikaProp = Property.externalDate(propName);
+                metadata.set(tikaProp, date);
+            } else if (val != null) {
+                metadata.set(propName, val);
+            }
+        }
     }
-    
+
     private <T> void addProperty(Metadata metadata, Property property, 
Nullable<T> nullableValue) {
         T value = nullableValue.getValue();
         if (value != null) {
@@ -283,7 +259,7 @@ public class MetadataExtractor {
             addProperty(metadata, name, value.getValue().toString());
         }
     }
-    
+
     private void addProperty(Metadata metadata, Property property, String 
value) {
         if (value != null) {
             metadata.set(property, value);
@@ -297,11 +273,11 @@ public class MetadataExtractor {
     }
 
     private void addProperty(Metadata metadata, Property property, int value) {
-       if (value > 0) {
-           metadata.set(property, value);
-       }
+        if (value > 0) {
+            metadata.set(property, value);
+        }
     }
-    
+
     private void addProperty(Metadata metadata, String name, int value) {
         if (value > 0) {
             metadata.set(name, Integer.toString(value));

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
 Fri May 29 14:36:21 2015
@@ -29,14 +29,14 @@ import org.xml.sax.SAXException;
 
 /**
  * Interface implemented by all Tika OOXML extractors.
- * 
+ *
  * @see org.apache.poi.POIXMLTextExtractor
  */
 public interface OOXMLExtractor {
 
     /**
      * Returns the opened document.
-     * 
+     *
      * @see POIXMLTextExtractor#getDocument()
      */
     POIXMLDocument getDocument();

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 Fri May 29 14:36:21 2015
@@ -56,7 +56,7 @@ public class OOXMLExtractorFactory {
             throws IOException, SAXException, TikaException {
         Locale locale = context.get(Locale.class, Locale.getDefault());
         ExtractorFactory.setThreadPrefersEventExtractors(true);
-        
+
         try {
             OOXMLExtractor extractor;
             OPCPackage pkg;
@@ -66,34 +66,34 @@ public class OOXMLExtractorFactory {
             if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
                 pkg = (OPCPackage) tis.getOpenContainer();
             } else if (tis != null && tis.hasFile()) {
-                pkg = OPCPackage.open( tis.getFile().getPath(), 
PackageAccess.READ );
+                pkg = OPCPackage.open(tis.getFile().getPath(), 
PackageAccess.READ);
                 tis.setOpenContainer(pkg);
             } else {
                 InputStream shield = new CloseShieldInputStream(stream);
-                pkg = OPCPackage.open(shield); 
+                pkg = OPCPackage.open(shield);
             }
-            
+
             // Get the type, and ensure it's one we handle
             MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
             if (type == null || 
OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
-               // Not a supported type, delegate to Empty Parser 
-               EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, 
context);
-               return;
+                // Not a supported type, delegate to Empty Parser
+                EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, 
context);
+                return;
             }
             metadata.set(Metadata.CONTENT_TYPE, type.toString());
 
             // Have the appropriate OOXML text extractor picked
             POIXMLTextExtractor poiExtractor = 
ExtractorFactory.createExtractor(pkg);
-            
+
             POIXMLDocument document = poiExtractor.getDocument();
             if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
-               extractor = new XSSFExcelExtractorDecorator(
-                   context, (XSSFEventBasedExcelExtractor)poiExtractor, 
locale);
+                extractor = new XSSFExcelExtractorDecorator(
+                        context, (XSSFEventBasedExcelExtractor) poiExtractor, 
locale);
             } else if (document == null) {
-               throw new TikaException(
-                     "Expecting UserModel based POI OOXML extractor with a 
document, but none found. " +
-                     "The extractor returned was a " + poiExtractor
-               );
+                throw new TikaException(
+                        "Expecting UserModel based POI OOXML extractor with a 
document, but none found. " +
+                                "The extractor returned was a " + poiExtractor
+                );
             } else if (document instanceof XMLSlideShow) {
                 extractor = new XSLFPowerPointExtractorDecorator(
                         context, (XSLFPowerPointExtractor) poiExtractor);
@@ -103,11 +103,11 @@ public class OOXMLExtractorFactory {
             } else {
                 extractor = new POIXMLTextExtractorDecorator(context, 
poiExtractor);
             }
-            
+
             // Get the bulk of the metadata first, so that it's accessible 
during
             //  parsing if desired by the client (see TIKA-1109)
             extractor.getMetadataExtractor().extract(metadata);
-            
+
             // Extract the text, along with any in-document metadata
             extractor.getXHTML(baseHandler, metadata, context);
         } catch (IllegalArgumentException e) {
@@ -115,7 +115,7 @@ public class OOXMLExtractorFactory {
                     e.getMessage().startsWith("No supported documents found")) 
{
                 throw new TikaException(
                         "TIKA-418: RuntimeException while getting content"
-                        + " for thmx and xps file types", e);
+                                + " for thmx and xps file types", e);
             } else {
                 throw new TikaException("Error creating OOXML extractor", e);
             }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
 Fri May 29 14:36:21 2015
@@ -36,39 +36,39 @@ import org.xml.sax.SAXException;
  */
 public class OOXMLParser extends AbstractParser {
 
-    /** Serial version UID */
-    private static final long serialVersionUID = 6535995710857776481L;
-   
     protected static final Set<MediaType> SUPPORTED_TYPES =
-        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                MediaType.application("x-tika-ooxml"),
-                
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
-                
MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"),
-                
MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"),
-                
MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"),
-                
MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"),
-                
MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"),
-                
MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
-                MediaType.application("vnd.ms-excel.sheet.macroenabled.12"),
-                
MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"),
-                MediaType.application("vnd.ms-excel.template.macroenabled.12"),
-                MediaType.application("vnd.ms-excel.addin.macroenabled.12"),
-                
MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
-                MediaType.application("vnd.ms-word.document.macroenabled.12"),
-                
MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"),
-                
MediaType.application("vnd.ms-word.template.macroenabled.12"))));
-    
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("x-tika-ooxml"),
+                    
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
+                    
MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"),
+                    
MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"),
+                    
MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"),
+                    
MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"),
+                    
MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"),
+                    
MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+                    
MediaType.application("vnd.ms-excel.sheet.macroenabled.12"),
+                    
MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"),
+                    
MediaType.application("vnd.ms-excel.template.macroenabled.12"),
+                    
MediaType.application("vnd.ms-excel.addin.macroenabled.12"),
+                    
MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
+                    
MediaType.application("vnd.ms-word.document.macroenabled.12"),
+                    
MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"),
+                    
MediaType.application("vnd.ms-word.template.macroenabled.12"))));
     /**
      * We claim to support all OOXML files, but we actually don't support a 
small
-     *  number of them.
+     * number of them.
      * This list is used to decline certain formats that are not yet supported
-     *  by Tika and/or POI.
+     * by Tika and/or POI.
+     */
+    protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    
MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
+                    MediaType.application("vnd.ms-xpsdocument")
+            )));
+    /**
+     * Serial version UID
      */
-    protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = 
-       Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                
MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
-                MediaType.application("vnd.ms-xpsdocument")
-       )));
+    private static final long serialVersionUID = 6535995710857776481L;
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
 Fri May 29 14:36:21 2015
@@ -39,6 +39,6 @@ public class POIXMLTextExtractorDecorato
 
     @Override
     protected List<PackagePart> getMainDocumentParts() {
-       return new ArrayList<PackagePart>();
+        return new ArrayList<PackagePart>();
     }
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 Fri May 29 14:36:21 2015
@@ -70,10 +70,10 @@ public class XSLFPowerPointExtractorDeco
         for (XSLFSlide slide : slides) {
             String slideDesc;
             if (slide.getPackagePart() != null && 
slide.getPackagePart().getPartName() != null) {
-              slideDesc = 
getJustFileName(slide.getPackagePart().getPartName().toString());
-              slideDesc += "_";
+                slideDesc = 
getJustFileName(slide.getPackagePart().getPartName().toString());
+                slideDesc += "_";
             } else {
-              slideDesc = null;
+                slideDesc = null;
             }
 
             // slide
@@ -118,27 +118,27 @@ public class XSLFPowerPointExtractorDeco
                     continue;
                 }
                 xhtml.element("p", txt.getText());
-            } else if (sh instanceof XSLFGroupShape){
+            } else if (sh instanceof XSLFGroupShape) {
                 // recurse into groups of shapes
-                XSLFGroupShape group = (XSLFGroupShape)sh;
+                XSLFGroupShape group = (XSLFGroupShape) sh;
                 extractContent(group.getShapes(), skipPlaceholders, xhtml, 
slideDesc);
             } else if (sh instanceof XSLFTable) {
-                XSLFTable tbl = (XSLFTable)sh;
-                for(XSLFTableRow row : tbl){
+                XSLFTable tbl = (XSLFTable) sh;
+                for (XSLFTableRow row : tbl) {
                     List<XSLFTableCell> cells = row.getCells();
                     extractContent(cells.toArray(new 
XSLFTableCell[cells.size()]), skipPlaceholders, xhtml, slideDesc);
                 }
             } else if (sh instanceof XSLFGraphicFrame) {
                 XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
                 XmlObject[] sp = frame.getXmlObject().selectPath(
-                                   "declare namespace 
p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
+                        "declare namespace 
p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
                 if (sp != null) {
-                    for(XmlObject emb : sp) {
+                    for (XmlObject emb : sp) {
                         XmlObject relIDAtt = emb.selectAttribute(new 
QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", 
"id"));
                         if (relIDAtt != null) {
                             String relID = 
relIDAtt.getDomNode().getNodeValue();
                             if (slideDesc != null) {
-                              relID = slideDesc + relID;
+                                relID = slideDesc + relID;
                             }
                             AttributesImpl attributes = new AttributesImpl();
                             attributes.addAttribute("", "class", "class", 
"CDATA", "embedded");
@@ -155,7 +155,7 @@ public class XSLFPowerPointExtractorDeco
                         String relID = 
ctPic.getBlipFill().getBlip().getEmbed();
                         if (relID != null) {
                             if (slideDesc != null) {
-                              relID = slideDesc + relID;
+                                relID = slideDesc + relID;
                             }
                             AttributesImpl attributes = new AttributesImpl();
                             attributes.addAttribute("", "class", "class", 
"CDATA", "embedded");
@@ -168,50 +168,50 @@ public class XSLFPowerPointExtractorDeco
             }
         }
     }
-    
+
     /**
      * In PowerPoint files, slides have things embedded in them,
-     *  and slide drawings which have the images
+     * and slide drawings which have the images
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() throws TikaException {
-       List<PackagePart> parts = new ArrayList<PackagePart>();
-       XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
-       XSLFSlideShow document = null;
-       try {
-          document = slideShow._getXSLFSlideShow(); // TODO Avoid this in 
future
-       } catch(Exception e) {
-          throw new TikaException(e.getMessage()); // Shouldn't happen
-       }
-
-       CTSlideIdList ctSlideIdList = document.getSlideReferences();
-       if (ctSlideIdList != null) {
-           for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) {
-               CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i);
-               // Add the slide
-               PackagePart slidePart;
-               try {
-                   slidePart = document.getSlidePart(ctSlide);
-               } catch (IOException e) {
-                   throw new TikaException("Broken OOXML file", e);
-               } catch (XmlException xe) {
-                   throw new TikaException("Broken OOXML file", xe);
-               }
-               parts.add(slidePart);
-
-               // If it has drawings, return those too
-               try {
-                   for (PackageRelationship rel : 
slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
-                       if (rel.getTargetMode() == TargetMode.INTERNAL) {
-                           PackagePartName relName = 
PackagingURIHelper.createPartName(rel.getTargetURI());
-                           parts.add(rel.getPackage().getPart(relName));
-                       }
-                   }
-               } catch (InvalidFormatException e) {
-                   throw new TikaException("Broken OOXML file", e);
-               }
-           }
-       }
-       return parts;
+        List<PackagePart> parts = new ArrayList<PackagePart>();
+        XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
+        XSLFSlideShow document = null;
+        try {
+            document = slideShow._getXSLFSlideShow(); // TODO Avoid this in 
future
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage()); // Shouldn't happen
+        }
+
+        CTSlideIdList ctSlideIdList = document.getSlideReferences();
+        if (ctSlideIdList != null) {
+            for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) {
+                CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i);
+                // Add the slide
+                PackagePart slidePart;
+                try {
+                    slidePart = document.getSlidePart(ctSlide);
+                } catch (IOException e) {
+                    throw new TikaException("Broken OOXML file", e);
+                } catch (XmlException xe) {
+                    throw new TikaException("Broken OOXML file", xe);
+                }
+                parts.add(slidePart);
+
+                // If it has drawings, return those too
+                try {
+                    for (PackageRelationship rel : 
slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
+                        if (rel.getTargetMode() == TargetMode.INTERNAL) {
+                            PackagePartName relName = 
PackagingURIHelper.createPartName(rel.getTargetURI());
+                            parts.add(rel.getPackage().getPart(relName));
+                        }
+                    }
+                } catch (InvalidFormatException e) {
+                    throw new TikaException("Broken OOXML file", e);
+                }
+            }
+        }
+        return parts;
     }
 }

svn commit: r1682489 [4/14] - in /tika/trunk: tika-parsers/src/main/java/org/apache/tika/parser/html/ tika-parsers/src/main/java/org/apache/tika/parser/image/ tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/ tika-parsers/src/main/java/org/a...

Reply via email to