Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Fri May 29 14:36:21 2015 @@ -59,10 +59,18 @@ public class WordExtractor extends Abstr private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011'; private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b'; // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3 - private static final String LIST_DELIMITER = " "; + private static final String LIST_DELIMITER = " "; + private static final Map<String, TagAndStyle> fixedParagraphStyles = new HashMap<String, TagAndStyle>(); + private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null); - public WordExtractor(ParseContext context) { - super(context); + static { + fixedParagraphStyles.put("Default", defaultParagraphStyle); + fixedParagraphStyles.put("Normal", defaultParagraphStyle); + fixedParagraphStyles.put("heading", new TagAndStyle("h1", null)); + fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null)); + fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title")); + fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle")); + fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null)); } // True if we are currently in the named style tag: @@ -70,6 +78,57 @@ public class WordExtractor extends Abstr private boolean curBold; private boolean curItalic; + public WordExtractor(ParseContext context) { + super(context); + } + + private static int countParagraphs(Range... ranges) { + int count = 0; + for (Range r : ranges) { + if (r != null) { + count += r.numParagraphs(); + } + } + return count; + } + + /** + * Given a style name, return what tag should be used, and + * what style should be applied to it. + */ + public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) { + TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName); + if (tagAndStyle != null) { + return tagAndStyle; + } + + if (styleName.equals("Table Contents") && isTable) { + return defaultParagraphStyle; + } + + String tag = "p"; + String styleClass = null; + + if (styleName.startsWith("heading") || styleName.startsWith("Heading")) { + // "Heading 3" or "Heading2" or "heading 4" + int num = 1; + try { + num = Integer.parseInt( + styleName.substring(styleName.length() - 1) + ); + } catch (NumberFormatException e) { + } + // Turn it into a H1 - H6 (H7+ isn't valid!) + tag = "h" + Math.min(num, 6); + } else { + styleClass = styleName.replace(' ', '_'); + styleClass = styleClass.substring(0, 1).toLowerCase(Locale.ROOT) + + styleClass.substring(1); + } + + return new TagAndStyle(tag, styleClass); + } + protected void parse( NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException { @@ -82,12 +141,12 @@ public class WordExtractor extends Abstr HWPFDocument document; try { document = new HWPFDocument(root); - } catch(OldWordFileFormatException e) { + } catch (OldWordFileFormatException e) { parseWord6(root, xhtml); return; } org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = - new org.apache.poi.hwpf.extractor.WordExtractor(document); + new org.apache.poi.hwpf.extractor.WordExtractor(document); HeaderStories headerFooter = new HeaderStories(document); // Grab the list of pictures. As far as we can tell, @@ -97,24 +156,24 @@ public class WordExtractor extends Abstr PicturesSource pictures = new PicturesSource(document); // Do any headers, if present - Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), - headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() }; + Range[] headers = new Range[]{headerFooter.getFirstHeaderSubrange(), + headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange()}; handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml); // Do the main paragraph text Range r = document.getRange(); ListManager listManager = new ListManager(document); - for(int i=0; i<r.numParagraphs(); i++) { - Paragraph p = r.getParagraph(i); - i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml); + for (int i = 0; i < r.numParagraphs(); i++) { + Paragraph p = r.getParagraph(i); + i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml); } // Do everything else - for (String paragraph: wordExtractor.getMainTextboxText()) { + for (String paragraph : wordExtractor.getMainTextboxText()) { xhtml.element("p", paragraph); } - for (String paragraph : wordExtractor.getFootnoteText()) { + for (String paragraph : wordExtractor.getFootnoteText()) { xhtml.element("p", paragraph); } @@ -127,16 +186,16 @@ public class WordExtractor extends Abstr } // Do any footers, if present - Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), - headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() }; + Range[] footers = new Range[]{headerFooter.getFirstFooterSubrange(), + headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange()}; handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml); // Handle any pictures that we haven't output yet - for(Picture p = pictures.nextUnclaimed(); p != null; ) { - handlePictureCharacterRun( - null, p, pictures, xhtml - ); - p = pictures.nextUnclaimed(); + for (Picture p = pictures.nextUnclaimed(); p != null; ) { + handlePictureCharacterRun( + null, p, pictures, xhtml + ); + p = pictures.nextUnclaimed(); } // Handle any embeded office documents @@ -148,32 +207,24 @@ public class WordExtractor extends Abstr handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } } - } catch(FileNotFoundException e) { - } - } - - private static int countParagraphs(Range... ranges) { - int count = 0; - for (Range r : ranges) { - if (r != null) { count += r.numParagraphs(); } + } catch (FileNotFoundException e) { } - return count; } private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document, - PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml) - throws SAXException, IOException, TikaException { + PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml) + throws SAXException, IOException, TikaException { if (countParagraphs(ranges) > 0) { xhtml.startElement("div", "class", type); ListManager listManager = new ListManager(document); for (Range r : ranges) { if (r != null) { - for(int i=0; i<r.numParagraphs(); i++) { + for (int i = 0; i < r.numParagraphs(); i++) { Paragraph p = r.getParagraph(i); i += handleParagraph(p, 0, r, document, FieldsDocumentPart.HEADER, pictures, pictureTable, listManager, xhtml); - } + } } } xhtml.endElement("div"); @@ -181,275 +232,276 @@ public class WordExtractor extends Abstr } private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, - FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, - XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { - // Note - a poi bug means we can't currently properly recurse - // into nested tables, so currently we don't - if(p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel==0) { - Table t = r.getTable(p); - xhtml.startElement("table"); - xhtml.startElement("tbody"); - for(int rn=0; rn<t.numRows(); rn++) { - TableRow row = t.getRow(rn); - xhtml.startElement("tr"); - for(int cn=0; cn<row.numCells(); cn++) { - TableCell cell = row.getCell(cn); - xhtml.startElement("td"); - - for(int pn=0; pn<cell.numParagraphs(); pn++) { - Paragraph cellP = cell.getParagraph(pn); - handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml); - } - xhtml.endElement("td"); - } - xhtml.endElement("tr"); - } - xhtml.endElement("tbody"); - xhtml.endElement("table"); - return (t.numParagraphs()-1); - } + FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, + XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { + // Note - a poi bug means we can't currently properly recurse + // into nested tables, so currently we don't + if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) { + Table t = r.getTable(p); + xhtml.startElement("table"); + xhtml.startElement("tbody"); + for (int rn = 0; rn < t.numRows(); rn++) { + TableRow row = t.getRow(rn); + xhtml.startElement("tr"); + for (int cn = 0; cn < row.numCells(); cn++) { + TableCell cell = row.getCell(cn); + xhtml.startElement("td"); + + for (int pn = 0; pn < cell.numParagraphs(); pn++) { + Paragraph cellP = cell.getParagraph(pn); + handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml); + } + xhtml.endElement("td"); + } + xhtml.endElement("tr"); + } + xhtml.endElement("tbody"); + xhtml.endElement("table"); + return (t.numParagraphs() - 1); + } - String text = p.text(); - if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) { + String text = p.text(); + if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) { // Skip empty paragraphs return 0; - } + } - TagAndStyle tas; - String numbering = null; + TagAndStyle tas; + String numbering = null; - if (document.getStyleSheet().numStyles()>p.getStyleIndex()) { - StyleDescription style = - document.getStyleSheet().getStyleDescription(p.getStyleIndex()); - if (style != null && style.getName() != null && style.getName().length() > 0) { - if (p.isInList()) { - numbering = listManager.getFormattedNumber(p); - } - tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel>0)); - } else { - tas = new TagAndStyle("p", null); - } - } else { - tas = new TagAndStyle("p", null); - } - - if(tas.getStyleClass() != null) { - xhtml.startElement(tas.getTag(), "class", tas.getStyleClass()); - } else { - xhtml.startElement(tas.getTag()); - } - - if (numbering != null) { - xhtml.characters(numbering); - } - - for(int j=0; j<p.numCharacterRuns(); j++) { - CharacterRun cr = p.getCharacterRun(j); - - // FIELD_BEGIN_MARK: - if (cr.text().getBytes(IOUtils.UTF_8)[0] == 0x13) { - Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); - // 58 is an embedded document - // 56 is a document link - if (field != null && (field.getType() == 58 || field.getType() == 56)) { - // Embedded Object: add a <div - // class="embedded" id="_X"/> so consumer can see where - // in the main text each embedded document - // occurred: - String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset(); - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "class", "class", "CDATA", "embedded"); - attributes.addAttribute("", "id", "id", "CDATA", id); - xhtml.startElement("div", attributes); - xhtml.endElement("div"); - } - } - - if(cr.text().equals("\u0013")) { - j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); - } else if(cr.text().startsWith("\u0008")) { - // Floating Picture(s) - for(int pn=0; pn<cr.text().length(); pn++) { - // Assume they're in the order from the unclaimed list... - Picture picture = pictures.nextUnclaimed(); + if (document.getStyleSheet().numStyles() > p.getStyleIndex()) { + StyleDescription style = + document.getStyleSheet().getStyleDescription(p.getStyleIndex()); + if (style != null && style.getName() != null && style.getName().length() > 0) { + if (p.isInList()) { + numbering = listManager.getFormattedNumber(p); + } + tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0)); + } else { + tas = new TagAndStyle("p", null); + } + } else { + tas = new TagAndStyle("p", null); + } - // Output + if (tas.getStyleClass() != null) { + xhtml.startElement(tas.getTag(), "class", tas.getStyleClass()); + } else { + xhtml.startElement(tas.getTag()); + } + + if (numbering != null) { + xhtml.characters(numbering); + } + + for (int j = 0; j < p.numCharacterRuns(); j++) { + CharacterRun cr = p.getCharacterRun(j); + + // FIELD_BEGIN_MARK: + if (cr.text().getBytes(IOUtils.UTF_8)[0] == 0x13) { + Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset()); + // 58 is an embedded document + // 56 is a document link + if (field != null && (field.getType() == 58 || field.getType() == 56)) { + // Embedded Object: add a <div + // class="embedded" id="_X"/> so consumer can see where + // in the main text each embedded document + // occurred: + String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", id); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + } + + if (cr.text().equals("\u0013")) { + j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml); + } else if (cr.text().startsWith("\u0008")) { + // Floating Picture(s) + for (int pn = 0; pn < cr.text().length(); pn++) { + // Assume they're in the order from the unclaimed list... + Picture picture = pictures.nextUnclaimed(); + + // Output + handlePictureCharacterRun(cr, picture, pictures, xhtml); + } + } else if (pictureTable.hasPicture(cr)) { + // Inline Picture + Picture picture = pictures.getFor(cr); handlePictureCharacterRun(cr, picture, pictures, xhtml); - } - } else if(pictureTable.hasPicture(cr)) { - // Inline Picture - Picture picture = pictures.getFor(cr); - handlePictureCharacterRun(cr, picture, pictures, xhtml); - } else { - handleCharacterRun(cr, tas.isHeading(), xhtml); - } - } - - // Close any still open style tags - if (curStrikeThrough) { - xhtml.endElement("s"); - curStrikeThrough = false; - } - if (curItalic) { - xhtml.endElement("i"); - curItalic = false; - } - if (curBold) { - xhtml.endElement("b"); - curBold = false; - } + } else { + handleCharacterRun(cr, tas.isHeading(), xhtml); + } + } + + // Close any still open style tags + if (curStrikeThrough) { + xhtml.endElement("s"); + curStrikeThrough = false; + } + if (curItalic) { + xhtml.endElement("i"); + curItalic = false; + } + if (curBold) { + xhtml.endElement("b"); + curBold = false; + } - xhtml.endElement(tas.getTag()); + xhtml.endElement(tas.getTag()); - return 0; + return 0; } private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml) - throws SAXException { - // Skip trailing newlines - if(!isRendered(cr) || cr.text().equals("\r")) - return; - - if(!skipStyling) { - if (cr.isBold() != curBold) { - // Enforce nesting -- must close s and i tags - if (curStrikeThrough) { - xhtml.endElement("s"); - curStrikeThrough = false; - } - if (curItalic) { - xhtml.endElement("i"); - curItalic = false; - } - if (cr.isBold()) { - xhtml.startElement("b"); - } else { - xhtml.endElement("b"); - } - curBold = cr.isBold(); - } - - if (cr.isItalic() != curItalic) { - // Enforce nesting -- must close s tag - if (curStrikeThrough) { - xhtml.endElement("s"); - curStrikeThrough = false; - } - if (cr.isItalic()) { - xhtml.startElement("i"); - } else { - xhtml.endElement("i"); - } - curItalic = cr.isItalic(); - } - - if (cr.isStrikeThrough() != curStrikeThrough) { - if (cr.isStrikeThrough()) { - xhtml.startElement("s"); - } else { - xhtml.endElement("s"); - } - curStrikeThrough = cr.isStrikeThrough(); - } - } - - // Clean up the text - String text = cr.text(); - text = text.replace('\r', '\n'); - if(text.endsWith("\u0007")) { - // Strip the table cell end marker - text = text.substring(0, text.length()-1); - } - - // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters: - - // Non-breaking hyphens are returned as char 30 - text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN); - - // Non-required hyphens to zero-width space - text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE); - - // Control characters as line break - text = text.replaceAll("[\u0000-\u001f]", "\n"); - xhtml.characters(text); + throws SAXException { + // Skip trailing newlines + if (!isRendered(cr) || cr.text().equals("\r")) + return; + + if (!skipStyling) { + if (cr.isBold() != curBold) { + // Enforce nesting -- must close s and i tags + if (curStrikeThrough) { + xhtml.endElement("s"); + curStrikeThrough = false; + } + if (curItalic) { + xhtml.endElement("i"); + curItalic = false; + } + if (cr.isBold()) { + xhtml.startElement("b"); + } else { + xhtml.endElement("b"); + } + curBold = cr.isBold(); + } + + if (cr.isItalic() != curItalic) { + // Enforce nesting -- must close s tag + if (curStrikeThrough) { + xhtml.endElement("s"); + curStrikeThrough = false; + } + if (cr.isItalic()) { + xhtml.startElement("i"); + } else { + xhtml.endElement("i"); + } + curItalic = cr.isItalic(); + } + + if (cr.isStrikeThrough() != curStrikeThrough) { + if (cr.isStrikeThrough()) { + xhtml.startElement("s"); + } else { + xhtml.endElement("s"); + } + curStrikeThrough = cr.isStrikeThrough(); + } + } + + // Clean up the text + String text = cr.text(); + text = text.replace('\r', '\n'); + if (text.endsWith("\u0007")) { + // Strip the table cell end marker + text = text.substring(0, text.length() - 1); + } + + // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters: + + // Non-breaking hyphens are returned as char 30 + text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN); + + // Non-required hyphens to zero-width space + text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE); + + // Control characters as line break + text = text.replaceAll("[\u0000-\u001f]", "\n"); + xhtml.characters(text); } + /** * Can be \13..text..\15 or \13..control..\14..text..\15 . * Nesting is allowed */ private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling, - PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException { - List<CharacterRun> controls = new ArrayList<CharacterRun>(); - List<CharacterRun> texts = new ArrayList<CharacterRun>(); - boolean has14 = false; - - // Split it into before and after the 14 - int i; - for(i=index+1; i<p.numCharacterRuns(); i++) { - CharacterRun cr = p.getCharacterRun(i); - if(cr.text().equals("\u0013")) { - // Nested, oh joy... - int increment = handleSpecialCharacterRuns(p, i+1, skipStyling, pictures, xhtml); - i += increment; - } else if(cr.text().equals("\u0014")) { - has14 = true; - } else if(cr.text().equals("\u0015")) { - if(!has14) { - texts = controls; - controls = new ArrayList<CharacterRun>(); - } - break; - } else { - if(has14) { - texts.add(cr); - } else { - controls.add(cr); - } - } - } - - // Do we need to do something special with this? - if(controls.size() > 0) { - String text = controls.get(0).text(); - for(int j=1; j<controls.size(); j++) { - text += controls.get(j).text(); - } - - if((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) - && text.indexOf('"') > -1) { - int start = text.indexOf('"') + 1; - int end = findHyperlinkEnd(text, start); - String url = ""; - if (start >= 0 && start < end && end <= text.length()) { - url = text.substring(start, end); - } - - xhtml.startElement("a", "href", url); - for(CharacterRun cr : texts) { - handleCharacterRun(cr, skipStyling, xhtml); - } - xhtml.endElement("a"); - } else { - // Just output the text ones - for(CharacterRun cr : texts) { - if(pictures.hasPicture(cr)) { - Picture picture = pictures.getFor(cr); - handlePictureCharacterRun(cr, picture, pictures, xhtml); + PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException { + List<CharacterRun> controls = new ArrayList<CharacterRun>(); + List<CharacterRun> texts = new ArrayList<CharacterRun>(); + boolean has14 = false; + + // Split it into before and after the 14 + int i; + for (i = index + 1; i < p.numCharacterRuns(); i++) { + CharacterRun cr = p.getCharacterRun(i); + if (cr.text().equals("\u0013")) { + // Nested, oh joy... + int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml); + i += increment; + } else if (cr.text().equals("\u0014")) { + has14 = true; + } else if (cr.text().equals("\u0015")) { + if (!has14) { + texts = controls; + controls = new ArrayList<CharacterRun>(); + } + break; + } else { + if (has14) { + texts.add(cr); } else { - handleCharacterRun(cr, skipStyling, xhtml); + controls.add(cr); } - } - } - } else { - // We only had text - // Output as-is - for(CharacterRun cr : texts) { - handleCharacterRun(cr, skipStyling, xhtml); - } - } + } + } + + // Do we need to do something special with this? + if (controls.size() > 0) { + String text = controls.get(0).text(); + for (int j = 1; j < controls.size(); j++) { + text += controls.get(j).text(); + } + + if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK")) + && text.indexOf('"') > -1) { + int start = text.indexOf('"') + 1; + int end = findHyperlinkEnd(text, start); + String url = ""; + if (start >= 0 && start < end && end <= text.length()) { + url = text.substring(start, end); + } + + xhtml.startElement("a", "href", url); + for (CharacterRun cr : texts) { + handleCharacterRun(cr, skipStyling, xhtml); + } + xhtml.endElement("a"); + } else { + // Just output the text ones + for (CharacterRun cr : texts) { + if (pictures.hasPicture(cr)) { + Picture picture = pictures.getFor(cr); + handlePictureCharacterRun(cr, picture, pictures, xhtml); + } else { + handleCharacterRun(cr, skipStyling, xhtml); + } + } + } + } else { + // We only had text + // Output as-is + for (CharacterRun cr : texts) { + handleCharacterRun(cr, skipStyling, xhtml); + } + } - // Tell them how many to skip over - return i-index; + // Tell them how many to skip over + return i - index; } //temporary work around for TIKA-1512 @@ -478,48 +530,48 @@ public class WordExtractor extends Abstr } private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml) - throws SAXException, IOException, TikaException { - if(!isRendered(cr) || picture == null) { - // Oh dear, we've run out... - // Probably caused by multiple \u0008 images referencing - // the same real image - return; - } - - // Which one is it? - String extension = picture.suggestFileExtension(); - int pictureNumber = pictures.pictureNumber(picture); - - // Make up a name for the picture - // There isn't one in the file, but we need to be able to reference - // the picture from the img tag and the embedded resource - String filename = "image"+pictureNumber+(extension.length()>0 ? "."+extension : ""); - - // Grab the mime type for the picture - String mimeType = picture.getMimeType(); - - // Output the img tag - AttributesImpl attr = new AttributesImpl(); - attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename); - attr.addAttribute("", "alt", "alt", "CDATA", filename); - xhtml.startElement("img", attr); - xhtml.endElement("img"); - - // Have we already output this one? - // (Only expose each individual image once) - if(! pictures.hasOutput(picture)) { - TikaInputStream stream = TikaInputStream.get(picture.getContent()); - handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false); - pictures.recordOutput(picture); - } + throws SAXException, IOException, TikaException { + if (!isRendered(cr) || picture == null) { + // Oh dear, we've run out... + // Probably caused by multiple \u0008 images referencing + // the same real image + return; + } + + // Which one is it? + String extension = picture.suggestFileExtension(); + int pictureNumber = pictures.pictureNumber(picture); + + // Make up a name for the picture + // There isn't one in the file, but we need to be able to reference + // the picture from the img tag and the embedded resource + String filename = "image" + pictureNumber + (extension.length() > 0 ? "." + extension : ""); + + // Grab the mime type for the picture + String mimeType = picture.getMimeType(); + + // Output the img tag + AttributesImpl attr = new AttributesImpl(); + attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename); + attr.addAttribute("", "alt", "alt", "CDATA", filename); + xhtml.startElement("img", attr); + xhtml.endElement("img"); + + // Have we already output this one? + // (Only expose each individual image once) + if (!pictures.hasOutput(picture)) { + TikaInputStream stream = TikaInputStream.get(picture.getContent()); + handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false); + pictures.recordOutput(picture); + } } /** * Outputs a section of text if the given text is non-empty. * - * @param xhtml XHTML content handler + * @param xhtml XHTML content handler * @param section the class of the <div/> section emitted - * @param text text to be emitted, if any + * @param text text to be emitted, if any * @throws SAXException if an error occurs */ private void addTextIfAny( @@ -544,77 +596,11 @@ public class WordExtractor extends Abstr HWPFOldDocument doc = new HWPFOldDocument(root); Word6Extractor extractor = new Word6Extractor(doc); - for(String p : extractor.getParagraphText()) { + for (String p : extractor.getParagraphText()) { xhtml.element("p", p); } } - private static final Map<String,TagAndStyle> fixedParagraphStyles = new HashMap<String,TagAndStyle>(); - private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null); - static { - fixedParagraphStyles.put("Default", defaultParagraphStyle); - fixedParagraphStyles.put("Normal", defaultParagraphStyle); - fixedParagraphStyles.put("heading", new TagAndStyle("h1", null)); - fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null)); - fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title")); - fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle")); - fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null)); - } - - /** - * Given a style name, return what tag should be used, and - * what style should be applied to it. - */ - public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) { - TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName); - if (tagAndStyle != null) { - return tagAndStyle; - } - - if (styleName.equals("Table Contents") && isTable) { - return defaultParagraphStyle; - } - - String tag = "p"; - String styleClass = null; - - if(styleName.startsWith("heading") || styleName.startsWith("Heading")) { - // "Heading 3" or "Heading2" or "heading 4" - int num = 1; - try { - num = Integer.parseInt( - styleName.substring(styleName.length()-1) - ); - } catch(NumberFormatException e) {} - // Turn it into a H1 - H6 (H7+ isn't valid!) - tag = "h" + Math.min(num, 6); - } else { - styleClass = styleName.replace(' ', '_'); - styleClass = styleClass.substring(0,1).toLowerCase(Locale.ROOT) + - styleClass.substring(1); - } - - return new TagAndStyle(tag,styleClass); - } - - public static class TagAndStyle { - private String tag; - private String styleClass; - public TagAndStyle(String tag, String styleClass) { - this.tag = tag; - this.styleClass = styleClass; - } - public String getTag() { - return tag; - } - public String getStyleClass() { - return styleClass; - } - public boolean isHeading() { - return tag.length()==2 && tag.startsWith("h"); - } - } - /** * Determines if character run should be included in the extraction. * @@ -622,81 +608,103 @@ public class WordExtractor extends Abstr * @return true if character run should be included in extraction. */ private boolean isRendered(final CharacterRun cr) { - return cr == null || !cr.isMarkedDeleted(); + return cr == null || !cr.isMarkedDeleted(); } + public static class TagAndStyle { + private String tag; + private String styleClass; + + public TagAndStyle(String tag, String styleClass) { + this.tag = tag; + this.styleClass = styleClass; + } + + public String getTag() { + return tag; + } + + public String getStyleClass() { + return styleClass; + } + + public boolean isHeading() { + return tag.length() == 2 && tag.startsWith("h"); + } + } /** * Provides access to the pictures both by offset, iteration - * over the un-claimed, and peeking forward + * over the un-claimed, and peeking forward */ private static class PicturesSource { - private PicturesTable picturesTable; - private Set<Picture> output = new HashSet<Picture>(); - private Map<Integer,Picture> lookup; - private List<Picture> nonU1based; - private List<Picture> all; - private int pn = 0; - - private PicturesSource(HWPFDocument doc) { - picturesTable = doc.getPicturesTable(); - all = picturesTable.getAllPictures(); - - // Build the Offset-Picture lookup map - lookup = new HashMap<Integer, Picture>(); - for(Picture p : all) { - lookup.put(p.getStartOffset(), p); - } - - // Work out which Pictures aren't referenced by - // a \u0001 in the main text - // These are \u0008 escher floating ones, ones - // found outside the normal text, and who - // knows what else... - nonU1based = new ArrayList<Picture>(); - nonU1based.addAll(all); - Range r = doc.getRange(); - for(int i=0; i<r.numCharacterRuns(); i++) { - CharacterRun cr = r.getCharacterRun(i); - if(picturesTable.hasPicture(cr)) { - Picture p = getFor(cr); - int at = nonU1based.indexOf(p); - nonU1based.set(at, null); - } - } - } - - private boolean hasPicture(CharacterRun cr) { - return picturesTable.hasPicture(cr); - } - - private void recordOutput(Picture picture) { - output.add(picture); - } - private boolean hasOutput(Picture picture) { - return output.contains(picture); - } - - private int pictureNumber(Picture picture) { - return all.indexOf(picture) + 1; - } - - private Picture getFor(CharacterRun cr) { - return lookup.get(cr.getPicOffset()); - } - - /** - * Return the next unclaimed one, used towards - * the end - */ - private Picture nextUnclaimed() { - Picture p = null; - while(pn < nonU1based.size()) { - p = nonU1based.get(pn); - pn++; - if(p != null) return p; - } - return null; - } + private PicturesTable picturesTable; + private Set<Picture> output = new HashSet<Picture>(); + private Map<Integer, Picture> lookup; + private List<Picture> nonU1based; + private List<Picture> all; + private int pn = 0; + + private PicturesSource(HWPFDocument doc) { + picturesTable = doc.getPicturesTable(); + all = picturesTable.getAllPictures(); + + // Build the Offset-Picture lookup map + lookup = new HashMap<Integer, Picture>(); + for (Picture p : all) { + lookup.put(p.getStartOffset(), p); + } + + // Work out which Pictures aren't referenced by + // a \u0001 in the main text + // These are \u0008 escher floating ones, ones + // found outside the normal text, and who + // knows what else... + nonU1based = new ArrayList<Picture>(); + nonU1based.addAll(all); + Range r = doc.getRange(); + for (int i = 0; i < r.numCharacterRuns(); i++) { + CharacterRun cr = r.getCharacterRun(i); + if (picturesTable.hasPicture(cr)) { + Picture p = getFor(cr); + int at = nonU1based.indexOf(p); + nonU1based.set(at, null); + } + } + } + + private boolean hasPicture(CharacterRun cr) { + return picturesTable.hasPicture(cr); + } + + private void recordOutput(Picture picture) { + output.add(picture); + } + + private boolean hasOutput(Picture picture) { + return output.contains(picture); + } + + private int pictureNumber(Picture picture) { + return all.indexOf(picture) + 1; + } + + private Picture getFor(CharacterRun cr) { + return lookup.get(cr.getPicOffset()); + } + + /** + * Return the next unclaimed one, used towards + * the end + */ + private Picture nextUnclaimed() { + Picture p = null; + while (pn < nonU1based.size()) { + p = nonU1based.get(pn); + pn++; + if (p != null) return p; + } + return null; + } } }
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Fri May 29 14:36:21 2015 @@ -53,7 +53,7 @@ import org.xml.sax.helpers.AttributesImp /** * Base class for all Tika OOXML extractors. - * + * <p/> * Tika extractors decorate POI extractors so that the parsed content of * documents is returned as a sequence of XHTML SAX events. Subclasses must * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that @@ -67,17 +67,15 @@ public abstract class AbstractOOXMLExtra private static final String TYPE_OLE_OBJECT = "application/vnd.openxmlformats-officedocument.oleObject"; - - protected POIXMLTextExtractor extractor; - private final EmbeddedDocumentExtractor embeddedExtractor; + protected POIXMLTextExtractor extractor; public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) { this.extractor = extractor; EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); - if (ex==null) { + if (ex == null) { embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context); } else { embeddedExtractor = ex; @@ -101,7 +99,7 @@ public abstract class AbstractOOXMLExtra /** * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler, - * org.apache.tika.metadata.Metadata) + * org.apache.tika.metadata.Metadata) */ public void getXHTML( ContentHandler handler, Metadata metadata, ParseContext context) @@ -113,55 +111,55 @@ public abstract class AbstractOOXMLExtra // Now do any embedded parts handleEmbeddedParts(handler); - + // thumbnail handleThumbnail(handler); xhtml.endDocument(); } - + protected String getJustFileName(String desc) { - int idx = desc.lastIndexOf('/'); - if (idx != -1) { - desc = desc.substring(idx+1); - } - idx = desc.lastIndexOf('.'); - if (idx != -1) { - desc = desc.substring(0, idx); - } + int idx = desc.lastIndexOf('/'); + if (idx != -1) { + desc = desc.substring(idx + 1); + } + idx = desc.lastIndexOf('.'); + if (idx != -1) { + desc = desc.substring(0, idx); + } - return desc; + return desc; } - - private void handleThumbnail( ContentHandler handler ) { + + private void handleThumbnail(ContentHandler handler) { try { OPCPackage opcPackage = extractor.getPackage(); - for (PackageRelationship rel : opcPackage.getRelationshipsByType( PackageRelationshipTypes.THUMBNAIL )) { + for (PackageRelationship rel : opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) { PackagePart tPart = opcPackage.getPart(rel); InputStream tStream = tPart.getInputStream(); - Metadata thumbnailMetadata = new Metadata(); + Metadata thumbnailMetadata = new Metadata(); String thumbName = tPart.getPartName().getName(); thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName); - + AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded"); attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName); handler.startElement(XHTML, "div", "div", attributes); handler.endElement(XHTML, "div", "div"); - + thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, thumbName); thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType()); thumbnailMetadata.set(TikaCoreProperties.TITLE, tPart.getPartName().getName()); - + if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) { embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false); } - + tStream.close(); } - } catch (Exception ex) { - - } + } catch (Exception ex) { + + } } private void handleEmbeddedParts(ContentHandler handler) @@ -175,9 +173,9 @@ public abstract class AbstractOOXMLExtra if (sourceURI != null) { sourceDesc = getJustFileName(sourceURI.getPath()); if (sourceDesc.startsWith("slide")) { - sourceDesc += "_"; + sourceDesc += "_"; } else { - sourceDesc = ""; + sourceDesc = ""; } } else { sourceDesc = ""; @@ -215,11 +213,11 @@ public abstract class AbstractOOXMLExtra private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel) throws IOException, SAXException { // A POIFSFileSystem needs to be at least 3 blocks big to be valid - if (part.getSize() >= 0 && part.getSize() < 512*3) { - // Too small, skip - return; + if (part.getSize() >= 0 && part.getSize() < 512 * 3) { + // Too small, skip + return; } - + // Open the POIFS (OLE2) structure and process POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream()); try { @@ -229,19 +227,19 @@ public abstract class AbstractOOXMLExtra DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); - + if (root.hasEntry("CONTENTS") - && root.hasEntry("\u0001Ole") - && root.hasEntry("\u0001CompObj") - && root.hasEntry("\u0003ObjInfo")) { - // TIKA-704: OLE 2.0 embedded non-Office document? - stream = TikaInputStream.get( - fs.createDocumentInputStream("CONTENTS")); - if (embeddedExtractor.shouldParseEmbedded(metadata)) { - embeddedExtractor.parseEmbedded( - stream, new EmbeddedContentHandler(handler), - metadata, false); - } + && root.hasEntry("\u0001Ole") + && root.hasEntry("\u0001CompObj") + && root.hasEntry("\u0003ObjInfo")) { + // TIKA-704: OLE 2.0 embedded non-Office document? + stream = TikaInputStream.get( + fs.createDocumentInputStream("CONTENTS")); + if (embeddedExtractor.shouldParseEmbedded(metadata)) { + embeddedExtractor.parseEmbedded( + stream, new EmbeddedContentHandler(handler), + metadata, false); + } } else if (POIFSDocumentType.OLE10_NATIVE == type) { // TIKA-704: OLE 1.0 embedded document Ole10Native ole = @@ -302,12 +300,12 @@ public abstract class AbstractOOXMLExtra */ protected abstract void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException; - + /** * Return a list of the main parts of the document, used - * when searching for embedded resources. + * when searching for embedded resources. * This should be all the parts of the document that end - * up with things embedded into them. + * up with things embedded into them. */ protected abstract List<PackagePart> getMainDocumentParts() throws TikaException; Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Fri May 29 14:36:21 2015 @@ -19,10 +19,10 @@ package org.apache.tika.parser.microsoft import java.math.BigDecimal; import java.util.Date; -import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.POIXMLProperties.CoreProperties; import org.apache.poi.POIXMLProperties.CustomProperties; import org.apache.poi.POIXMLProperties.ExtendedProperties; +import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart; import org.apache.poi.openxml4j.util.Nullable; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; @@ -40,9 +40,9 @@ import org.openxmlformats.schemas.office /** * OOXML metadata extractor. - * + * <p/> * Currently POI doesn't support metadata extraction for OOXML. - * + * * @see OOXMLExtractor#getMetadataExtractor() */ public class MetadataExtractor { @@ -55,8 +55,8 @@ public class MetadataExtractor { public void extract(Metadata metadata) throws TikaException { if (extractor.getDocument() != null || - (extractor instanceof XSSFEventBasedExcelExtractor && - extractor.getPackage() != null)) { + (extractor instanceof XSSFEventBasedExcelExtractor && + extractor.getPackage() != null)) { extractMetadata(extractor.getCoreProperties(), metadata); extractMetadata(extractor.getExtendedProperties(), metadata); extractMetadata(extractor.getCustomProperties(), metadata); @@ -89,15 +89,15 @@ public class MetadataExtractor { addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder .getModifiedProperty()); addProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder - .getModifiedProperty()); + .getModifiedProperty()); addProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder .getRevisionProperty()); // TODO: Move to OO subject in Tika 2.0 - addProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, + addProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, propsHolder.getSubjectProperty()); addProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty()); addProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty()); - + // Legacy Tika-1.0 style stats // TODO Remove these in Tika 2.0 addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty()); @@ -109,7 +109,7 @@ public class MetadataExtractor { } private void extractMetadata(ExtendedProperties properties, - Metadata metadata) { + Metadata metadata) { CTProperties propsHolder = properties.getUnderlyingProperties(); addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication()); @@ -123,9 +123,9 @@ public class MetadataExtractor { addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, propsHolder.getTotalTime()); if (propsHolder.getPages() > 0) { - metadata.set(PagedText.N_PAGES, propsHolder.getPages()); + metadata.set(PagedText.N_PAGES, propsHolder.getPages()); } else if (propsHolder.getSlides() > 0) { - metadata.set(PagedText.N_PAGES, propsHolder.getSlides()); + metadata.set(PagedText.N_PAGES, propsHolder.getSlides()); } // Process the document statistics @@ -136,7 +136,7 @@ public class MetadataExtractor { addProperty(metadata, Office.WORD_COUNT, propsHolder.getWords()); addProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters()); addProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces()); - + // Legacy Tika-1.0 style stats // TODO Remove these in Tika 2.0 addProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication()); @@ -156,113 +156,89 @@ public class MetadataExtractor { } private void extractMetadata(CustomProperties properties, - Metadata metadata) { - org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties - props = properties.getUnderlyingProperties(); - for (int i = 0; i < props.sizeOfPropertyArray(); i++) { - CTProperty property = props.getPropertyArray(i); - String val = null; - Date date = null; - - if (property.isSetLpwstr()) { - val = property.getLpwstr(); - } - else if (property.isSetLpstr()) { - val = property.getLpstr(); - } - else if (property.isSetDate()) { - date = property.getDate().getTime(); - } - else if (property.isSetFiletime()) { - date = property.getFiletime().getTime(); - } - - else if (property.isSetBool()) { - val = Boolean.toString( property.getBool() ); - } - - // Integers - else if (property.isSetI1()) { - val = Integer.toString(property.getI1()); - } - else if (property.isSetI2()) { - val = Integer.toString(property.getI2()); - } - else if (property.isSetI4()) { - val = Integer.toString(property.getI4()); - } - else if (property.isSetI8()) { - val = Long.toString(property.getI8()); - } - else if (property.isSetInt()) { - val = Integer.toString( property.getInt() ); - } - - // Unsigned Integers - else if (property.isSetUi1()) { - val = Integer.toString(property.getUi1()); - } - else if (property.isSetUi2()) { - val = Integer.toString(property.getUi2()); - } - else if (property.isSetUi4()) { - val = Long.toString(property.getUi4()); - } - else if (property.isSetUi8()) { - val = property.getUi8().toString(); - } - else if (property.isSetUint()) { - val = Long.toString(property.getUint()); - } - - // Reals - else if (property.isSetR4()) { - val = Float.toString( property.getR4() ); - } - else if (property.isSetR8()) { - val = Double.toString( property.getR8() ); - } - else if (property.isSetDecimal()) { - BigDecimal d = property.getDecimal(); - if (d == null) { - val = null; - } else { - val = d.toPlainString(); - } - } - - else if (property.isSetArray()) { - // TODO Fetch the array values and output - } - else if (property.isSetVector()) { - // TODO Fetch the vector values and output - } - - else if (property.isSetBlob() || property.isSetOblob()) { - // TODO Decode, if possible - } - else if (property.isSetStream() || property.isSetOstream() || - property.isSetVstream()) { - // TODO Decode, if possible - } - else if (property.isSetStorage() || property.isSetOstorage()) { - // TODO Decode, if possible - } - - else { - // This type isn't currently supported yet, skip the property - } - - String propName = "custom:" + property.getName(); - if (date != null) { - Property tikaProp = Property.externalDate(propName); - metadata.set(tikaProp, date); - } else if (val != null) { - metadata.set(propName, val); - } - } + Metadata metadata) { + org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties + props = properties.getUnderlyingProperties(); + for (int i = 0; i < props.sizeOfPropertyArray(); i++) { + CTProperty property = props.getPropertyArray(i); + String val = null; + Date date = null; + + if (property.isSetLpwstr()) { + val = property.getLpwstr(); + } else if (property.isSetLpstr()) { + val = property.getLpstr(); + } else if (property.isSetDate()) { + date = property.getDate().getTime(); + } else if (property.isSetFiletime()) { + date = property.getFiletime().getTime(); + } else if (property.isSetBool()) { + val = Boolean.toString(property.getBool()); + } + + // Integers + else if (property.isSetI1()) { + val = Integer.toString(property.getI1()); + } else if (property.isSetI2()) { + val = Integer.toString(property.getI2()); + } else if (property.isSetI4()) { + val = Integer.toString(property.getI4()); + } else if (property.isSetI8()) { + val = Long.toString(property.getI8()); + } else if (property.isSetInt()) { + val = Integer.toString(property.getInt()); + } + + // Unsigned Integers + else if (property.isSetUi1()) { + val = Integer.toString(property.getUi1()); + } else if (property.isSetUi2()) { + val = Integer.toString(property.getUi2()); + } else if (property.isSetUi4()) { + val = Long.toString(property.getUi4()); + } else if (property.isSetUi8()) { + val = property.getUi8().toString(); + } else if (property.isSetUint()) { + val = Long.toString(property.getUint()); + } + + // Reals + else if (property.isSetR4()) { + val = Float.toString(property.getR4()); + } else if (property.isSetR8()) { + val = Double.toString(property.getR8()); + } else if (property.isSetDecimal()) { + BigDecimal d = property.getDecimal(); + if (d == null) { + val = null; + } else { + val = d.toPlainString(); + } + } else if (property.isSetArray()) { + // TODO Fetch the array values and output + } else if (property.isSetVector()) { + // TODO Fetch the vector values and output + } else if (property.isSetBlob() || property.isSetOblob()) { + // TODO Decode, if possible + } else if (property.isSetStream() || property.isSetOstream() || + property.isSetVstream()) { + // TODO Decode, if possible + } else if (property.isSetStorage() || property.isSetOstorage()) { + // TODO Decode, if possible + } else { + // This type isn't currently supported yet, skip the property + } + + String propName = "custom:" + property.getName(); + if (date != null) { + Property tikaProp = Property.externalDate(propName); + metadata.set(tikaProp, date); + } else if (val != null) { + metadata.set(propName, val); + } + } } - + private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) { T value = nullableValue.getValue(); if (value != null) { @@ -283,7 +259,7 @@ public class MetadataExtractor { addProperty(metadata, name, value.getValue().toString()); } } - + private void addProperty(Metadata metadata, Property property, String value) { if (value != null) { metadata.set(property, value); @@ -297,11 +273,11 @@ public class MetadataExtractor { } private void addProperty(Metadata metadata, Property property, int value) { - if (value > 0) { - metadata.set(property, value); - } + if (value > 0) { + metadata.set(property, value); + } } - + private void addProperty(Metadata metadata, String name, int value) { if (value > 0) { metadata.set(name, Integer.toString(value)); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java Fri May 29 14:36:21 2015 @@ -29,14 +29,14 @@ import org.xml.sax.SAXException; /** * Interface implemented by all Tika OOXML extractors. - * + * * @see org.apache.poi.POIXMLTextExtractor */ public interface OOXMLExtractor { /** * Returns the opened document. - * + * * @see POIXMLTextExtractor#getDocument() */ POIXMLDocument getDocument(); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Fri May 29 14:36:21 2015 @@ -56,7 +56,7 @@ public class OOXMLExtractorFactory { throws IOException, SAXException, TikaException { Locale locale = context.get(Locale.class, Locale.getDefault()); ExtractorFactory.setThreadPrefersEventExtractors(true); - + try { OOXMLExtractor extractor; OPCPackage pkg; @@ -66,34 +66,34 @@ public class OOXMLExtractorFactory { if (tis != null && tis.getOpenContainer() instanceof OPCPackage) { pkg = (OPCPackage) tis.getOpenContainer(); } else if (tis != null && tis.hasFile()) { - pkg = OPCPackage.open( tis.getFile().getPath(), PackageAccess.READ ); + pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ); tis.setOpenContainer(pkg); } else { InputStream shield = new CloseShieldInputStream(stream); - pkg = OPCPackage.open(shield); + pkg = OPCPackage.open(shield); } - + // Get the type, and ensure it's one we handle MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg); if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) { - // Not a supported type, delegate to Empty Parser - EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context); - return; + // Not a supported type, delegate to Empty Parser + EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context); + return; } metadata.set(Metadata.CONTENT_TYPE, type.toString()); // Have the appropriate OOXML text extractor picked POIXMLTextExtractor poiExtractor = ExtractorFactory.createExtractor(pkg); - + POIXMLDocument document = poiExtractor.getDocument(); if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { - extractor = new XSSFExcelExtractorDecorator( - context, (XSSFEventBasedExcelExtractor)poiExtractor, locale); + extractor = new XSSFExcelExtractorDecorator( + context, (XSSFEventBasedExcelExtractor) poiExtractor, locale); } else if (document == null) { - throw new TikaException( - "Expecting UserModel based POI OOXML extractor with a document, but none found. " + - "The extractor returned was a " + poiExtractor - ); + throw new TikaException( + "Expecting UserModel based POI OOXML extractor with a document, but none found. " + + "The extractor returned was a " + poiExtractor + ); } else if (document instanceof XMLSlideShow) { extractor = new XSLFPowerPointExtractorDecorator( context, (XSLFPowerPointExtractor) poiExtractor); @@ -103,11 +103,11 @@ public class OOXMLExtractorFactory { } else { extractor = new POIXMLTextExtractorDecorator(context, poiExtractor); } - + // Get the bulk of the metadata first, so that it's accessible during // parsing if desired by the client (see TIKA-1109) extractor.getMetadataExtractor().extract(metadata); - + // Extract the text, along with any in-document metadata extractor.getXHTML(baseHandler, metadata, context); } catch (IllegalArgumentException e) { @@ -115,7 +115,7 @@ public class OOXMLExtractorFactory { e.getMessage().startsWith("No supported documents found")) { throw new TikaException( "TIKA-418: RuntimeException while getting content" - + " for thmx and xps file types", e); + + " for thmx and xps file types", e); } else { throw new TikaException("Error creating OOXML extractor", e); } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Fri May 29 14:36:21 2015 @@ -36,39 +36,39 @@ import org.xml.sax.SAXException; */ public class OOXMLParser extends AbstractParser { - /** Serial version UID */ - private static final long serialVersionUID = 6535995710857776481L; - protected static final Set<MediaType> SUPPORTED_TYPES = - Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( - MediaType.application("x-tika-ooxml"), - MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"), - MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"), - MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"), - MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"), - MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"), - MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"), - MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"), - MediaType.application("vnd.ms-excel.sheet.macroenabled.12"), - MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"), - MediaType.application("vnd.ms-excel.template.macroenabled.12"), - MediaType.application("vnd.ms-excel.addin.macroenabled.12"), - MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"), - MediaType.application("vnd.ms-word.document.macroenabled.12"), - MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"), - MediaType.application("vnd.ms-word.template.macroenabled.12")))); - + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("x-tika-ooxml"), + MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"), + MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"), + MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"), + MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"), + MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"), + MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"), + MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"), + MediaType.application("vnd.ms-excel.sheet.macroenabled.12"), + MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"), + MediaType.application("vnd.ms-excel.template.macroenabled.12"), + MediaType.application("vnd.ms-excel.addin.macroenabled.12"), + MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"), + MediaType.application("vnd.ms-word.document.macroenabled.12"), + MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"), + MediaType.application("vnd.ms-word.template.macroenabled.12")))); /** * We claim to support all OOXML files, but we actually don't support a small - * number of them. + * number of them. * This list is used to decline certain formats that are not yet supported - * by Tika and/or POI. + * by Tika and/or POI. + */ + protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = + Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"), + MediaType.application("vnd.ms-xpsdocument") + ))); + /** + * Serial version UID */ - protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = - Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( - MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"), - MediaType.application("vnd.ms-xpsdocument") - ))); + private static final long serialVersionUID = 6535995710857776481L; public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Fri May 29 14:36:21 2015 @@ -39,6 +39,6 @@ public class POIXMLTextExtractorDecorato @Override protected List<PackagePart> getMainDocumentParts() { - return new ArrayList<PackagePart>(); + return new ArrayList<PackagePart>(); } } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1682489&r1=1682488&r2=1682489&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Fri May 29 14:36:21 2015 @@ -70,10 +70,10 @@ public class XSLFPowerPointExtractorDeco for (XSLFSlide slide : slides) { String slideDesc; if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) { - slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString()); - slideDesc += "_"; + slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString()); + slideDesc += "_"; } else { - slideDesc = null; + slideDesc = null; } // slide @@ -118,27 +118,27 @@ public class XSLFPowerPointExtractorDeco continue; } xhtml.element("p", txt.getText()); - } else if (sh instanceof XSLFGroupShape){ + } else if (sh instanceof XSLFGroupShape) { // recurse into groups of shapes - XSLFGroupShape group = (XSLFGroupShape)sh; + XSLFGroupShape group = (XSLFGroupShape) sh; extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc); } else if (sh instanceof XSLFTable) { - XSLFTable tbl = (XSLFTable)sh; - for(XSLFTableRow row : tbl){ + XSLFTable tbl = (XSLFTable) sh; + for (XSLFTableRow row : tbl) { List<XSLFTableCell> cells = row.getCells(); extractContent(cells.toArray(new XSLFTableCell[cells.size()]), skipPlaceholders, xhtml, slideDesc); } } else if (sh instanceof XSLFGraphicFrame) { XSLFGraphicFrame frame = (XSLFGraphicFrame) sh; XmlObject[] sp = frame.getXmlObject().selectPath( - "declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj"); + "declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj"); if (sp != null) { - for(XmlObject emb : sp) { + for (XmlObject emb : sp) { XmlObject relIDAtt = emb.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id")); if (relIDAtt != null) { String relID = relIDAtt.getDomNode().getNodeValue(); if (slideDesc != null) { - relID = slideDesc + relID; + relID = slideDesc + relID; } AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); @@ -155,7 +155,7 @@ public class XSLFPowerPointExtractorDeco String relID = ctPic.getBlipFill().getBlip().getEmbed(); if (relID != null) { if (slideDesc != null) { - relID = slideDesc + relID; + relID = slideDesc + relID; } AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); @@ -168,50 +168,50 @@ public class XSLFPowerPointExtractorDeco } } } - + /** * In PowerPoint files, slides have things embedded in them, - * and slide drawings which have the images + * and slide drawings which have the images */ @Override protected List<PackagePart> getMainDocumentParts() throws TikaException { - List<PackagePart> parts = new ArrayList<PackagePart>(); - XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument(); - XSLFSlideShow document = null; - try { - document = slideShow._getXSLFSlideShow(); // TODO Avoid this in future - } catch(Exception e) { - throw new TikaException(e.getMessage()); // Shouldn't happen - } - - CTSlideIdList ctSlideIdList = document.getSlideReferences(); - if (ctSlideIdList != null) { - for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) { - CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i); - // Add the slide - PackagePart slidePart; - try { - slidePart = document.getSlidePart(ctSlide); - } catch (IOException e) { - throw new TikaException("Broken OOXML file", e); - } catch (XmlException xe) { - throw new TikaException("Broken OOXML file", xe); - } - parts.add(slidePart); - - // If it has drawings, return those too - try { - for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) { - if (rel.getTargetMode() == TargetMode.INTERNAL) { - PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); - parts.add(rel.getPackage().getPart(relName)); - } - } - } catch (InvalidFormatException e) { - throw new TikaException("Broken OOXML file", e); - } - } - } - return parts; + List<PackagePart> parts = new ArrayList<PackagePart>(); + XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument(); + XSLFSlideShow document = null; + try { + document = slideShow._getXSLFSlideShow(); // TODO Avoid this in future + } catch (Exception e) { + throw new TikaException(e.getMessage()); // Shouldn't happen + } + + CTSlideIdList ctSlideIdList = document.getSlideReferences(); + if (ctSlideIdList != null) { + for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) { + CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i); + // Add the slide + PackagePart slidePart; + try { + slidePart = document.getSlidePart(ctSlide); + } catch (IOException e) { + throw new TikaException("Broken OOXML file", e); + } catch (XmlException xe) { + throw new TikaException("Broken OOXML file", xe); + } + parts.add(slidePart); + + // If it has drawings, return those too + try { + for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + } catch (InvalidFormatException e) { + throw new TikaException("Broken OOXML file", e); + } + } + } + return parts; } }
