Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.ooxml; + +import javax.xml.namespace.QName; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackagePartName; +import org.apache.poi.openxml4j.opc.PackageRelationship; +import org.apache.poi.openxml4j.opc.PackagingURIHelper; +import org.apache.poi.openxml4j.opc.TargetMode; +import org.apache.poi.xslf.XSLFSlideShow; +import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; +import org.apache.poi.xslf.usermodel.Placeholder; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFCommentAuthors; +import org.apache.poi.xslf.usermodel.XSLFComments; +import org.apache.poi.xslf.usermodel.XSLFGraphicFrame; +import org.apache.poi.xslf.usermodel.XSLFGroupShape; +import org.apache.poi.xslf.usermodel.XSLFNotes; +import org.apache.poi.xslf.usermodel.XSLFNotesMaster; +import org.apache.poi.xslf.usermodel.XSLFPictureShape; +import org.apache.poi.xslf.usermodel.XSLFRelation; +import org.apache.poi.xslf.usermodel.XSLFShape; +import org.apache.poi.xslf.usermodel.XSLFSheet; +import org.apache.poi.xslf.usermodel.XSLFSlide; +import org.apache.poi.xslf.usermodel.XSLFSlideLayout; +import org.apache.poi.xslf.usermodel.XSLFTable; +import org.apache.poi.xslf.usermodel.XSLFTableCell; +import org.apache.poi.xslf.usermodel.XSLFTableRow; +import org.apache.poi.xslf.usermodel.XSLFTextParagraph; +import org.apache.poi.xslf.usermodel.XSLFTextShape; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.xmlbeans.XmlException; +import org.apache.xmlbeans.XmlObject; +import org.openxmlformats.schemas.presentationml.x2006.main.CTComment; +import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor; +import 
org.openxmlformats.schemas.presentationml.x2006.main.CTPicture; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList; +import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { + public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) { + super(context, extractor); + } + + /** + * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText() + */ + protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException { + XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument(); + XSLFCommentAuthors commentAuthors = slideShow.getCommentAuthors(); + + List<XSLFSlide> slides = slideShow.getSlides(); + for (XSLFSlide slide : slides) { + String slideDesc; + if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) { + slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString()); + slideDesc += "_"; + } else { + slideDesc = null; + } + + // slide content + xhtml.startElement("div", "class", "slide-content"); + extractContent(slide.getShapes(), false, xhtml, slideDesc); + xhtml.endElement("div"); + + // slide layout which is the master sheet for this slide + xhtml.startElement("div", "class", "slide-master-content"); + XSLFSlideLayout slideLayout = slide.getMasterSheet(); + extractContent(slideLayout.getShapes(), true, xhtml, null); + xhtml.endElement("div"); + + // slide master which is the master sheet for all text layouts + XSLFSheet slideMaster = slideLayout.getMasterSheet(); + extractContent(slideMaster.getShapes(), true, xhtml, null); + + // notes (if present) + XSLFNotes slideNotes = slide.getNotes(); + if (slideNotes != null) { + xhtml.startElement("div", "class", "slide-notes"); + + extractContent(slideNotes.getShapes(), false, xhtml, slideDesc); + + 
// master sheet for this notes + XSLFNotesMaster notesMaster = slideNotes.getMasterSheet(); + extractContent(notesMaster.getShapes(), true, xhtml, null); + xhtml.endElement("div"); + } + + // comments (if present) + XSLFComments comments = slide.getComments(); + if (comments != null) { + StringBuilder authorStringBuilder = new StringBuilder(); + for (int i = 0; i < comments.getNumberOfComments(); i++) { + authorStringBuilder.setLength(0); + CTComment comment = comments.getCommentAt(i); + xhtml.startElement("p", "class", "slide-comment"); + CTCommentAuthor cta = commentAuthors.getAuthorById(comment.getAuthorId()); + if (cta != null) { + if (cta.getName() != null) { + authorStringBuilder.append(cta.getName()); + } + if (cta.getInitials() != null) { + if (authorStringBuilder.length() > 0) { + authorStringBuilder.append(" "); + } + authorStringBuilder.append("("+cta.getInitials()+")"); + } + if (comment.getText() != null && authorStringBuilder.length() > 0) { + authorStringBuilder.append(" - "); + } + if (authorStringBuilder.length() > 0) { + xhtml.startElement("b"); + xhtml.characters(authorStringBuilder.toString()); + xhtml.endElement("b"); + } + } + xhtml.characters(comment.getText()); + xhtml.endElement("p"); + } + } + } + } + + private void extractContent(List<? 
extends XSLFShape> shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc) + throws SAXException { + for (XSLFShape sh : shapes) { + if (sh instanceof XSLFTextShape) { + XSLFTextShape txt = (XSLFTextShape) sh; + Placeholder ph = txt.getTextType(); + if (skipPlaceholders && ph != null) { + continue; + } + for (XSLFTextParagraph p : txt.getTextParagraphs()) { + xhtml.element("p", p.getText()); + } + } else if (sh instanceof XSLFGroupShape) { + // recurse into groups of shapes + XSLFGroupShape group = (XSLFGroupShape) sh; + extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc); + } else if (sh instanceof XSLFTable) { + //unlike tables in Word, ppt/x can't have recursive tables...I don't think + extractTable((XSLFTable)sh, xhtml); + } else if (sh instanceof XSLFGraphicFrame) { + XSLFGraphicFrame frame = (XSLFGraphicFrame) sh; + XmlObject[] sp = frame.getXmlObject().selectPath( + "declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj"); + if (sp != null) { + for (XmlObject emb : sp) { + XmlObject relIDAtt = emb.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id")); + if (relIDAtt != null) { + String relID = relIDAtt.getDomNode().getNodeValue(); + if (slideDesc != null) { + relID = slideDesc + relID; + } + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", relID); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + } + } + } else if (sh instanceof XSLFPictureShape) { + if (!skipPlaceholders && (sh.getXmlObject() instanceof CTPicture)) { + CTPicture ctPic = ((CTPicture) sh.getXmlObject()); + if (ctPic.getBlipFill() != null && ctPic.getBlipFill().getBlip() != null) { + String relID = ctPic.getBlipFill().getBlip().getEmbed(); + if (relID != null) { + if (slideDesc != null) { + relID = slideDesc + relID; + } 
+ AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", relID); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + } + } + } + } + } + + private void extractTable(XSLFTable tbl, XHTMLContentHandler xhtml) throws SAXException { + xhtml.startElement("table"); + for (XSLFTableRow row : tbl) { + xhtml.startElement("tr"); + List<XSLFTableCell> cells = row.getCells(); + for (XSLFTableCell c : row.getCells()) { + xhtml.startElement("td"); + xhtml.characters(c.getText()); + xhtml.endElement("td"); + } + xhtml.endElement("tr"); + } + xhtml.endElement("table"); + + } + + /** + * In PowerPoint files, slides have things embedded in them, + * and slide drawings which have the images + */ + @Override + protected List<PackagePart> getMainDocumentParts() throws TikaException { + List<PackagePart> parts = new ArrayList<>(); + XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument(); + XSLFSlideShow document = null; + try { + document = slideShow._getXSLFSlideShow(); // TODO Avoid this in future + } catch (Exception e) { + throw new TikaException(e.getMessage()); // Shouldn't happen + } + + CTSlideIdList ctSlideIdList = document.getSlideReferences(); + if (ctSlideIdList != null) { + for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) { + CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i); + // Add the slide + PackagePart slidePart; + try { + slidePart = document.getSlidePart(ctSlide); + } catch (IOException e) { + throw new TikaException("Broken OOXML file", e); + } catch (XmlException xe) { + throw new TikaException("Broken OOXML file", xe); + } + parts.add(slidePart); + + // If it has drawings, return those too + try { + for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = 
PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + } catch (InvalidFormatException e) { + throw new TikaException("Broken OOXML file", e); + } + } + } + return parts; + } +}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,395 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.ooxml; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; +import org.apache.poi.openxml4j.exceptions.OpenXML4JException; +import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.openxml4j.opc.PackagePartName; +import org.apache.poi.openxml4j.opc.PackageRelationship; +import org.apache.poi.openxml4j.opc.PackagingURIHelper; +import org.apache.poi.openxml4j.opc.TargetMode; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.usermodel.HeaderFooter; +import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; +import org.apache.poi.xssf.eventusermodel.XSSFReader; +import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; +import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; +import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; +import org.apache.poi.xssf.model.CommentsTable; +import org.apache.poi.xssf.model.StylesTable; +import org.apache.poi.xssf.usermodel.XSSFComment; +import org.apache.poi.xssf.usermodel.XSSFRelation; +import org.apache.poi.xssf.usermodel.XSSFShape; +import org.apache.poi.xssf.usermodel.XSSFSimpleShape; +import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.xmlbeans.XmlException; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; 
+import org.xml.sax.InputSource; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { + /** + * Allows access to headers/footers from raw xml strings + */ + private static HeaderFooterHelper hfHelper = new HeaderFooterHelper(); + private final XSSFEventBasedExcelExtractor extractor; + private final DataFormatter formatter; + private final List<PackagePart> sheetParts = new ArrayList<PackagePart>(); + private Metadata metadata; + + public XSSFExcelExtractorDecorator( + ParseContext context, XSSFEventBasedExcelExtractor extractor, Locale locale) { + super(context, extractor); + + this.extractor = extractor; + extractor.setFormulasNotResults(false); + extractor.setLocale(locale); + + if (locale == null) { + formatter = new DataFormatter(); + } else { + formatter = new DataFormatter(locale); + } + } + + @Override + public void getXHTML( + ContentHandler handler, Metadata metadata, ParseContext context) + throws SAXException, XmlException, IOException, TikaException { + + this.metadata = metadata; + metadata.set(TikaMetadataKeys.PROTECTED, "false"); + + super.getXHTML(handler, metadata, context); + } + + /** + * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText() + */ + @Override + protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, + XmlException, IOException { + OPCPackage container = extractor.getPackage(); + + ReadOnlySharedStringsTable strings; + XSSFReader.SheetIterator iter; + XSSFReader xssfReader; + StylesTable styles; + try { + xssfReader = new XSSFReader(container); + styles = xssfReader.getStylesTable(); + iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData(); + strings = new ReadOnlySharedStringsTable(container); + } catch (InvalidFormatException e) { + throw new XmlException(e); + } catch (OpenXML4JException oe) { + throw new XmlException(oe); + } + + while (iter.hasNext()) { + InputStream stream = 
iter.next(); + sheetParts.add(iter.getSheetPart()); + + SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml); + CommentsTable comments = iter.getSheetComments(); + + // Start, and output the sheet name + xhtml.startElement("div"); + xhtml.element("h1", iter.getSheetName()); + + // Extract the main sheet contents + xhtml.startElement("table"); + xhtml.startElement("tbody"); + + processSheet(sheetExtractor, comments, styles, strings, stream); + + xhtml.endElement("tbody"); + xhtml.endElement("table"); + + // Output any headers and footers + // (Need to process the sheet to get them, so we can't + // do the headers before the contents) + for (String header : sheetExtractor.headers) { + extractHeaderFooter(header, xhtml); + } + for (String footer : sheetExtractor.footers) { + extractHeaderFooter(footer, xhtml); + } + processShapes(iter.getShapes(), xhtml); + // All done with this sheet + xhtml.endElement("div"); + } + } + + private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml) + throws SAXException { + String content = ExcelExtractor._extractHeaderFooter( + new HeaderFooterFromString(hf)); + if (content.length() > 0) { + xhtml.element("p", content); + } + } + + private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException { + if (shapes == null) { + return; + } + for (XSSFShape shape : shapes) { + if (shape instanceof XSSFSimpleShape) { + String sText = ((XSSFSimpleShape) shape).getText(); + if (sText != null && sText.length() > 0) { + xhtml.element("p", sText); + } + } + } + } + + public void processSheet( + SheetContentsHandler sheetContentsExtractor, + CommentsTable comments, + StylesTable styles, + ReadOnlySharedStringsTable strings, + InputStream sheetInputStream) + throws IOException, SAXException { + InputSource sheetSource = new InputSource(sheetInputStream); + SAXParserFactory saxFactory = SAXParserFactory.newInstance(); + try { + SAXParser saxParser = saxFactory.newSAXParser(); + XMLReader 
sheetParser = saxParser.getXMLReader(); + XSSFSheetInterestingPartsCapturer handler = + new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler( + styles, comments, strings, sheetContentsExtractor, formatter, false)); + sheetParser.setContentHandler(handler); + sheetParser.parse(sheetSource); + sheetInputStream.close(); + + if (handler.hasProtection) { + metadata.set(TikaMetadataKeys.PROTECTED, "true"); + } + } catch (ParserConfigurationException e) { + throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage()); + } + } + + /** + * In Excel files, sheets have things embedded in them, + * and sheet drawings which have the images + */ + @Override + protected List<PackagePart> getMainDocumentParts() throws TikaException { + List<PackagePart> parts = new ArrayList<PackagePart>(); + for (PackagePart part : sheetParts) { + // Add the sheet + parts.add(part); + + // If it has drawings, return those too + try { + for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) { + if (rel.getTargetMode() == TargetMode.INTERNAL) { + PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI()); + parts.add(rel.getPackage().getPart(relName)); + } + } + } catch (InvalidFormatException e) { + throw new TikaException("Broken OOXML file", e); + } + } + + return parts; + } + + /** + * Turns formatted sheet events into HTML + */ + protected static class SheetTextAsHTML implements SheetContentsHandler { + private XHTMLContentHandler xhtml; + private List<String> headers; + private List<String> footers; + + protected SheetTextAsHTML(XHTMLContentHandler xhtml) { + this.xhtml = xhtml; + headers = new 
ArrayList<String>(); + footers = new ArrayList<String>(); + } + + public void startRow(int rowNum) { + try { + xhtml.startElement("tr"); + } catch (SAXException e) { + } + } + + public void endRow(int rowNum) { + try { + xhtml.endElement("tr"); + } catch (SAXException e) { + } + } + + public void cell(String cellRef, String formattedValue, XSSFComment comment) { + try { + xhtml.startElement("td"); + + // Main cell contents + if (formattedValue != null) { + xhtml.characters(formattedValue); + } + + // Comments + if (comment != null) { + xhtml.startElement("br"); + xhtml.endElement("br"); + xhtml.characters(comment.getAuthor()); + xhtml.characters(": "); + xhtml.characters(comment.getString().getString()); + } + + xhtml.endElement("td"); + } catch (SAXException e) { + } + } + + public void headerFooter(String text, boolean isHeader, String tagName) { + if (isHeader) { + headers.add(text); + } else { + footers.add(text); + } + } + } + + protected static class HeaderFooterFromString implements HeaderFooter { + private String text; + + protected HeaderFooterFromString(String text) { + this.text = text; + } + + public String getCenter() { + return hfHelper.getCenterSection(text); + } + + public void setCenter(String paramString) { + } + + public String getLeft() { + return hfHelper.getLeftSection(text); + } + + public void setLeft(String paramString) { + } + + public String getRight() { + return hfHelper.getRightSection(text); + } + + public void setRight(String paramString) { + } + } + + /** + * Captures information on interesting tags, whilst + * delegating the main work to the formatting handler + */ + protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler { + private ContentHandler delegate; + private boolean hasProtection = false; + + protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) { + this.delegate = delegate; + } + + public void startElement(String uri, String localName, String qName, + Attributes atts) throws 
SAXException { + if ("sheetProtection".equals(qName)) { + hasProtection = true; + } + delegate.startElement(uri, localName, qName, atts); + } + + public void characters(char[] ch, int start, int length) + throws SAXException { + delegate.characters(ch, start, length); + } + + public void endDocument() throws SAXException { + delegate.endDocument(); + } + + public void endElement(String uri, String localName, String qName) + throws SAXException { + delegate.endElement(uri, localName, qName); + } + + public void endPrefixMapping(String prefix) throws SAXException { + delegate.endPrefixMapping(prefix); + } + + public void ignorableWhitespace(char[] ch, int start, int length) + throws SAXException { + delegate.ignorableWhitespace(ch, start, length); + } + + public void processingInstruction(String target, String data) + throws SAXException { + delegate.processingInstruction(target, data); + } + + public void setDocumentLocator(Locator locator) { + delegate.setDocumentLocator(locator); + } + + public void skippedEntity(String name) throws SAXException { + delegate.skippedEntity(name); + } + + public void startDocument() throws SAXException { + delegate.startDocument(); + } + + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + delegate.startPrefixMapping(prefix, uri); + } + } +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java (added) +++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import org.apache.poi.xwpf.usermodel.XWPFAbstractNum; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFNum; +import org.apache.poi.xwpf.usermodel.XWPFNumbering; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.apache.tika.parser.microsoft.AbstractListManager; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl; + + +public class XWPFListManager extends AbstractListManager { + private final static boolean OVERRIDE_AVAILABLE; + private final static String SKIP_FORMAT = Character.toString((char) 61623);//if this shows up as the lvlText, don't show a number + + static { + boolean 
b = false; + try { + Class.forName("org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl"); + b = true; + } catch (ClassNotFoundException e) { + } + b = OVERRIDE_AVAILABLE = false; + + } + + private final XWPFNumbering numbering; + + //map of numId (which paragraph series is this a member of?), levelcounts + public XWPFListManager(XWPFDocument document) { + numbering = document.getNumbering(); + } + + /** + * + * @param paragraph paragraph + * @return the formatted number or an empty string if something went wrong + */ + public String getFormattedNumber(final XWPFParagraph paragraph) { + int currNumId = paragraph.getNumID().intValue(); + XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID()); + if (xwpfNum == null) { + return ""; + } + CTNum ctNum = xwpfNum.getCTNum(); + CTDecimalNumber abNum = ctNum.getAbstractNumId(); + int currAbNumId = abNum.getVal().intValue(); + + ParagraphLevelCounter lc = listLevelMap.get(currAbNumId); + LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId); + if (lc == null) { + lc = loadLevelTuples(abNum); + } + if (overrideTuples == null) { + overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels()); + } + + String formattedString = lc.incrementLevel(paragraph.getNumIlvl().intValue(), overrideTuples); + + listLevelMap.put(currAbNumId, lc); + overrideTupleMap.put(currNumId, overrideTuples); + + return formattedString; + } + + private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) { + LevelTuple[] levelTuples = new LevelTuple[length]; + int overrideLength = ctNum.sizeOfLvlOverrideArray(); + if (overrideLength == 0) { + return null; + } + for (int i = 0; i < length; i++) { + LevelTuple tuple; + if (i >= overrideLength) { + tuple = new LevelTuple("%"+i+"."); + } else { + CTNumLvl ctNumLvl = ctNum.getLvlOverrideArray(i); + if (ctNumLvl != null) { + tuple = buildTuple(i, ctNumLvl.getLvl()); + } else { + tuple = new LevelTuple("%"+i+"."); + } + } + levelTuples[i] = tuple; + } + return levelTuples; + } + + 
+ private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) { + //Unfortunately, we need to go this far into the underlying structure + //to get the abstract num information for the edge case where + //someone skips a level and the format is not context-free, e.g. "1.B.i". + XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal()); + CTAbstractNum ctAbstractNum = abstractNum.getCTAbstractNum(); + + LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()]; + for (int i = 0; i < levels.length; i++) { + levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i)); + } + return new ParagraphLevelCounter(levels); + } + + private LevelTuple buildTuple(int level, CTLvl ctLvl) { + boolean isLegal = false; + int start = 1; + int restart = -1; + String lvlText = "%" + level + "."; + String numFmt = "decimal"; + + + if (ctLvl != null && ctLvl.getIsLgl() != null) { + isLegal = true; + } + + if (ctLvl != null && ctLvl.getNumFmt() != null && + ctLvl.getNumFmt().getVal() != null) { + numFmt = ctLvl.getNumFmt().getVal().toString(); + } + if (ctLvl != null && ctLvl.getLvlRestart() != null && + ctLvl.getLvlRestart().getVal() != null) { + restart = ctLvl.getLvlRestart().getVal().intValue(); + } + if (ctLvl != null && ctLvl.getStart() != null && + ctLvl.getStart().getVal() != null) { + start = ctLvl.getStart().getVal().intValue(); + } else { + + //this is a hack. Currently, this gets the lowest possible + //start for a given numFmt. We should probably try to grab the + //restartNumberingAfterBreak value in + //e.g. <w:abstractNum w:abstractNumId="12" w15:restartNumberingAfterBreak="0">??? 
+ if ("decimal".equals(numFmt) || "ordinal".equals(numFmt) || "decimalZero".equals(numFmt)) { + start = 0; + } else { + start = 1; + } + } + if (ctLvl != null && ctLvl.getLvlText() != null && ctLvl.getLvlText().getVal() != null) { + lvlText = ctLvl.getLvlText().getVal(); + } + return new LevelTuple(start, restart, lvlText, numFmt, isLegal); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,459 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.microsoft.ooxml; + +import javax.xml.namespace.QName; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.poi.openxml4j.opc.PackagePart; +import org.apache.poi.xwpf.extractor.XWPFWordExtractor; +import org.apache.poi.xwpf.model.XWPFCommentsDecorator; +import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; +import org.apache.poi.xwpf.usermodel.BodyType; +import org.apache.poi.xwpf.usermodel.IBody; +import org.apache.poi.xwpf.usermodel.IBodyElement; +import org.apache.poi.xwpf.usermodel.ICell; +import org.apache.poi.xwpf.usermodel.IRunElement; +import org.apache.poi.xwpf.usermodel.ISDTContent; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter; +import org.apache.poi.xwpf.usermodel.XWPFHyperlink; +import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.apache.poi.xwpf.usermodel.XWPFPicture; +import org.apache.poi.xwpf.usermodel.XWPFPictureData; +import org.apache.poi.xwpf.usermodel.XWPFRun; +import org.apache.poi.xwpf.usermodel.XWPFSDT; +import org.apache.poi.xwpf.usermodel.XWPFSDTCell; +import org.apache.poi.xwpf.usermodel.XWPFStyle; +import org.apache.poi.xwpf.usermodel.XWPFStyles; +import org.apache.poi.xwpf.usermodel.XWPFTable; +import org.apache.poi.xwpf.usermodel.XWPFTableCell; +import org.apache.poi.xwpf.usermodel.XWPFTableRow; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.microsoft.WordExtractor; +import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.xmlbeans.XmlCursor; +import org.apache.xmlbeans.XmlException; +import org.apache.xmlbeans.XmlObject; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject; +import 
org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor { + + // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3 + private static final String LIST_DELIMITER = " "; + + + private XWPFDocument document; + private XWPFStyles styles; + + public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) { + super(context, extractor); + + document = (XWPFDocument) extractor.getDocument(); + styles = document.getStyles(); + } + + /** + * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText() + */ + @Override + protected void buildXHTML(XHTMLContentHandler xhtml) + throws SAXException, XmlException, IOException { + XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy(); + XWPFListManager listManager = new XWPFListManager(document); + // headers + if (hfPolicy != null) { + extractHeaders(xhtml, hfPolicy, listManager); + } + + // process text in the order that it occurs in + extractIBodyText(document, listManager, xhtml); + + // then all document tables + if (hfPolicy != null) { + extractFooters(xhtml, hfPolicy, listManager); + } + } + + private void extractIBodyText(IBody bodyElement, XWPFListManager listManager, + XHTMLContentHandler xhtml) + throws SAXException, XmlException, IOException { + for (IBodyElement element : bodyElement.getBodyElements()) { + if (element instanceof XWPFParagraph) { + XWPFParagraph paragraph = (XWPFParagraph) element; + extractParagraph(paragraph, listManager, xhtml); + } + if (element instanceof XWPFTable) { + XWPFTable table = (XWPFTable) element; + extractTable(table, listManager, xhtml); + } + if (element instanceof XWPFSDT) { + extractSDT((XWPFSDT) element, xhtml); + } + + } + } + + private void extractSDT(XWPFSDT 
element, XHTMLContentHandler xhtml) throws SAXException, + XmlException, IOException { + ISDTContent content = element.getContent(); + String tag = "p"; + xhtml.startElement(tag); + xhtml.characters(content.getText()); + xhtml.endElement(tag); + } + + private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager, + XHTMLContentHandler xhtml) + throws SAXException, XmlException, IOException { + // If this paragraph is actually a whole new section, then + // it could have its own headers and footers + // Check and handle if so + XWPFHeaderFooterPolicy headerFooterPolicy = null; + if (paragraph.getCTP().getPPr() != null) { + CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr(); + if (ctSectPr != null) { + headerFooterPolicy = + new XWPFHeaderFooterPolicy(document, ctSectPr); + extractHeaders(xhtml, headerFooterPolicy, listManager); + } + } + + // Is this a paragraph, or a heading? + String tag = "p"; + String styleClass = null; + if (paragraph.getStyleID() != null) { + XWPFStyle style = styles.getStyle( + paragraph.getStyleID() + ); + + if (style != null && style.getName() != null) { + TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle( + style.getName(), paragraph.getPartType() == BodyType.TABLECELL + ); + tag = tas.getTag(); + styleClass = tas.getStyleClass(); + } + } + + if (styleClass == null) { + xhtml.startElement(tag); + } else { + xhtml.startElement(tag, "class", styleClass); + } + + writeParagraphNumber(paragraph, listManager, xhtml); + // Output placeholder for any embedded docs: + + // TODO: replace w/ XPath/XQuery: + for (XWPFRun run : paragraph.getRuns()) { + XmlCursor c = run.getCTR().newCursor(); + c.selectPath("./*"); + while (c.toNextSelection()) { + XmlObject o = c.getObject(); + if (o instanceof CTObject) { + XmlCursor c2 = o.newCursor(); + c2.selectPath("./*"); + while (c2.toNextSelection()) { + XmlObject o2 = c2.getObject(); + + XmlObject embedAtt = o2.selectAttribute(new QName("Type")); + if (embedAtt != null && 
embedAtt.getDomNode().getNodeValue().equals("Embed")) { + // Type is "Embed" + XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id")); + if (relIDAtt != null) { + String relID = relIDAtt.getDomNode().getNodeValue(); + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "class", "class", "CDATA", "embedded"); + attributes.addAttribute("", "id", "id", "CDATA", relID); + xhtml.startElement("div", attributes); + xhtml.endElement("div"); + } + } + } + c2.dispose(); + } + } + + c.dispose(); + } + + // Attach bookmarks for the paragraph + // (In future, we might put them in the right place, for now + // we just put them in the correct paragraph) + for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) { + CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i); + xhtml.startElement("a", "name", bookmark.getName()); + xhtml.endElement("a"); + } + + TmpFormatting fmtg = new TmpFormatting(false, false); + + // Do the iruns + for (IRunElement run : paragraph.getIRuns()) { + if (run instanceof XWPFSDT) { + fmtg = closeStyleTags(xhtml, fmtg); + processSDTRun((XWPFSDT) run, xhtml); + //for now, we're ignoring formatting in sdt + //if you hit an sdt reset to false + fmtg.setBold(false); + fmtg.setItalic(false); + } else { + fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg); + } + } + closeStyleTags(xhtml, fmtg); + + + // Now do any comments for the paragraph + XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null); + String commentText = comments.getCommentText(); + if (commentText != null && commentText.length() > 0) { + xhtml.characters(commentText); + } + + String footnameText = paragraph.getFootnoteText(); + if (footnameText != null && footnameText.length() > 0) { + xhtml.characters(footnameText + "\n"); + } + + // Also extract any paragraphs embedded in text boxes: + for (XmlObject embeddedParagraph : 
paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) { + extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml); + } + + // Finish this paragraph + xhtml.endElement(tag); + + if (headerFooterPolicy != null) { + extractFooters(xhtml, headerFooterPolicy, listManager); + } + } + + private void writeParagraphNumber(XWPFParagraph paragraph, + XWPFListManager listManager, + XHTMLContentHandler xhtml) throws SAXException { + if (paragraph.getNumIlvl() == null) { + return; + } + String number = listManager.getFormattedNumber(paragraph); + if (number != null) { + xhtml.characters(number); + } + + } + + private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml, + TmpFormatting fmtg) throws SAXException { + // Close any still open style tags + if (fmtg.isItalic()) { + xhtml.endElement("i"); + fmtg.setItalic(false); + } + if (fmtg.isBold()) { + xhtml.endElement("b"); + fmtg.setBold(false); + } + return fmtg; + } + + private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph, + XHTMLContentHandler xhtml, TmpFormatting tfmtg) + throws SAXException, XmlException, IOException { + // True if we are currently in the named style tag: + if (run.isBold() != tfmtg.isBold()) { + if (tfmtg.isItalic()) { + xhtml.endElement("i"); + tfmtg.setItalic(false); + } + if (run.isBold()) { + xhtml.startElement("b"); + } else { + xhtml.endElement("b"); + } + tfmtg.setBold(run.isBold()); + } + + if (run.isItalic() != tfmtg.isItalic()) { + if (run.isItalic()) { + xhtml.startElement("i"); + } else { + xhtml.endElement("i"); + } + tfmtg.setItalic(run.isItalic()); + } + + boolean addedHREF = false; + if (run instanceof XWPFHyperlinkRun) { + XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run; + XWPFHyperlink link = 
linkRun.getHyperlink(document); + if (link != null && link.getURL() != null) { + xhtml.startElement("a", "href", link.getURL()); + addedHREF = true; + } else if (linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) { + xhtml.startElement("a", "href", "#" + linkRun.getAnchor()); + addedHREF = true; + } + } + + xhtml.characters(run.toString()); + + // If we have any pictures, output them + for (XWPFPicture picture : run.getEmbeddedPictures()) { + if (paragraph.getDocument() != null) { + XWPFPictureData data = picture.getPictureData(); + if (data != null) { + AttributesImpl attr = new AttributesImpl(); + + attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName()); + attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription()); + + xhtml.startElement("img", attr); + xhtml.endElement("img"); + } + } + } + + if (addedHREF) { + xhtml.endElement("a"); + } + + return tfmtg; + } + + private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml) + throws SAXException, XmlException, IOException { + xhtml.characters(run.getContent().getText()); + } + + private void extractTable(XWPFTable table, XWPFListManager listManager, + XHTMLContentHandler xhtml) + throws SAXException, XmlException, IOException { + xhtml.startElement("table"); + xhtml.startElement("tbody"); + for (XWPFTableRow row : table.getRows()) { + xhtml.startElement("tr"); + for (ICell cell : row.getTableICells()) { + xhtml.startElement("td"); + if (cell instanceof XWPFTableCell) { + extractIBodyText((XWPFTableCell) cell, listManager, xhtml); + } else if (cell instanceof XWPFSDTCell) { + xhtml.characters(((XWPFSDTCell) cell).getContent().getText()); + } + xhtml.endElement("td"); + } + xhtml.endElement("tr"); + } + xhtml.endElement("tbody"); + xhtml.endElement("table"); + } + + private void extractFooters( + XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, + XWPFListManager listManager) + throws SAXException, XmlException, IOException { + // footers + if 
(hfPolicy.getFirstPageFooter() != null) { + extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(), listManager); + } + if (hfPolicy.getEvenPageFooter() != null) { + extractHeaderText(xhtml, hfPolicy.getEvenPageFooter(), listManager); + } + if (hfPolicy.getDefaultFooter() != null) { + extractHeaderText(xhtml, hfPolicy.getDefaultFooter(), listManager); + } + } + + private void extractHeaders( + XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager) + throws SAXException, XmlException, IOException { + if (hfPolicy == null) return; + + if (hfPolicy.getFirstPageHeader() != null) { + extractHeaderText(xhtml, hfPolicy.getFirstPageHeader(), listManager); + } + + if (hfPolicy.getEvenPageHeader() != null) { + extractHeaderText(xhtml, hfPolicy.getEvenPageHeader(), listManager); + } + + if (hfPolicy.getDefaultHeader() != null) { + extractHeaderText(xhtml, hfPolicy.getDefaultHeader(), listManager); + } + } + + private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header, XWPFListManager listManager) throws SAXException, XmlException, IOException { + + for (IBodyElement e : header.getBodyElements()) { + if (e instanceof XWPFParagraph) { + extractParagraph((XWPFParagraph) e, listManager, xhtml); + } else if (e instanceof XWPFTable) { + extractTable((XWPFTable) e, listManager, xhtml); + } else if (e instanceof XWPFSDT) { + extractSDT((XWPFSDT) e, xhtml); + } + } + } + + /** + * Word documents are simple, they only have the one + * main part + */ + @Override + protected List<PackagePart> getMainDocumentParts() { + List<PackagePart> parts = new ArrayList<PackagePart>(); + parts.add(document.getPackagePart()); + return parts; + } + + private class TmpFormatting { + private boolean bold = false; + private boolean italic = false; + + private TmpFormatting(boolean bold, boolean italic) { + this.bold = bold; + this.italic = italic; + } + + public boolean isBold() { + return bold; + } + + public void setBold(boolean bold) { + 
this.bold = bold; + } + + public boolean isItalic() { + return italic; + } + + public void setItalic(boolean italic) { + this.italic = italic; + } + + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.odf; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Locale; + +import org.apache.tika.sax.ContentHandlerDecorator; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Content handler decorator that:<ul> + * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li> + * <li>Returns a fake DTD when parser requests OpenOffice DTD</li> + * </ul> + */ +public class NSNormalizerContentHandler extends ContentHandlerDecorator { + + private static final String OLD_NS = + "http://openoffice.org/2000/"; + + private static final String NEW_NS = + "urn:oasis:names:tc:opendocument:xmlns:"; + + private static final String DTD_PUBLIC_ID = + "-//OpenOffice.org//DTD OfficeDocument 1.0//EN"; + + public NSNormalizerContentHandler(ContentHandler handler) { + super(handler); + } + + private String mapOldNS(String ns) { + if (ns != null && ns.startsWith(OLD_NS)) { + return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0"; + } else { + return ns; + } + } + + @Override + public void startElement( + String namespaceURI, String localName, String qName, + Attributes atts) throws SAXException { + AttributesImpl natts = new AttributesImpl(); + for (int i = 0; i < atts.getLength(); i++) { + natts.addAttribute( + mapOldNS(atts.getURI(i)), atts.getLocalName(i), + atts.getQName(i), atts.getType(i), atts.getValue(i)); + } + super.startElement(mapOldNS(namespaceURI), localName, qName, atts); + } + + @Override + public void endElement(String namespaceURI, String localName, String qName) + throws SAXException { + super.endElement(mapOldNS(namespaceURI), localName, qName); + } + + @Override + public void startPrefixMapping(String prefix, String uri) + throws SAXException { + super.startPrefixMapping(prefix, mapOldNS(uri)); + } + + /** + * do not load any DTDs (may be requested by parser). 
Fake the DTD by + * returning a empty string as InputSource + */ + @Override + public InputSource resolveEntity(String publicId, String systemId) + throws IOException, SAXException { + if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd")) + || DTD_PUBLIC_ID.equals(publicId)) { + return new InputSource(new StringReader("")); + } else { + return super.resolveEntity(publicId, systemId); + } + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,515 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.odf; + +import javax.xml.XMLConstants; +import javax.xml.namespace.QName; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.util.BitSet; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.Stack; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ElementMappingContentHandler; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.sax.ElementMappingContentHandler.TargetElement; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.SAXNotRecognizedException; +import org.xml.sax.helpers.AttributesImpl; +import org.xml.sax.helpers.DefaultHandler; + +import static org.apache.tika.sax.XHTMLContentHandler.XHTML; + +/** + * Parser for ODF <code>content.xml</code> files. + */ +public class OpenDocumentContentParser extends AbstractParser { + private interface Style { + } + + private static class TextStyle implements Style { + public boolean italic; + public boolean bold; + public boolean underlined; + } + + private static class ListStyle implements Style { + public boolean ordered; + + public String getTag() { + return ordered ? 
"ol" : "ul"; + } + } + + private static final class OpenDocumentElementMappingContentHandler extends + ElementMappingContentHandler { + private final ContentHandler handler; + private final BitSet textNodeStack = new BitSet(); + private int nodeDepth = 0; + private int completelyFiltered = 0; + private Stack<String> headingStack = new Stack<String>(); + private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>(); + private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>(); + private TextStyle textStyle; + private TextStyle lastTextStyle; + private Stack<ListStyle> listStyleStack = new Stack<ListStyle>(); + private ListStyle listStyle; + + private OpenDocumentElementMappingContentHandler(ContentHandler handler, + Map<QName, TargetElement> mappings) { + super(handler, mappings); + this.handler = handler; + } + + @Override + public void characters(char[] ch, int start, int length) + throws SAXException { + // only forward content of tags from text:-namespace + if (completelyFiltered == 0 && nodeDepth > 0 + && textNodeStack.get(nodeDepth - 1)) { + lazyEndSpan(); + super.characters(ch, start, length); + } + } + + // helper for checking tags which need complete filtering + // (with sub-tags) + private boolean needsCompleteFiltering( + String namespaceURI, String localName) { + if (TEXT_NS.equals(namespaceURI)) { + return localName.endsWith("-template") + || localName.endsWith("-style"); + } + return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName); + } + + // map the heading level to <hX> HTML tags + private String getXHTMLHeaderTagName(Attributes atts) { + String depthStr = atts.getValue(TEXT_NS, "outline-level"); + if (depthStr == null) { + return "h1"; + } + + int depth = Integer.parseInt(depthStr); + if (depth >= 6) { + return "h6"; + } else if (depth <= 1) { + return "h1"; + } else { + return "h" + depth; + } + } + + /** + * Check if a node is a text node + */ + private boolean isTextNode(String 
namespaceURI, String localName) { + if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) { + return true; + } + if (SVG_NS.equals(namespaceURI)) { + return "title".equals(localName) || + "desc".equals(localName); + } + return false; + } + + private void startList(String name) throws SAXException { + String elementName = "ul"; + if (name != null) { + ListStyle style = listStyleMap.get(name); + elementName = style != null ? style.getTag() : "ul"; + listStyleStack.push(style); + } + handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES); + } + + private void endList() throws SAXException { + String elementName = "ul"; + if (!listStyleStack.isEmpty()) { + ListStyle style = listStyleStack.pop(); + elementName = style != null ? style.getTag() : "ul"; + } + handler.endElement(XHTML, elementName, elementName); + } + + private void startSpan(String name) throws SAXException { + if (name == null) { + return; + } + + TextStyle style = textStyleMap.get(name); + if (style == null) { + return; + } + + // End tags that refer to no longer valid styles + if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) { + handler.endElement(XHTML, "u", "u"); + } + if (!style.italic && lastTextStyle != null && lastTextStyle.italic) { + handler.endElement(XHTML, "i", "i"); + } + if (!style.bold && lastTextStyle != null && lastTextStyle.bold) { + handler.endElement(XHTML, "b", "b"); + } + + // Start tags for new styles + if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) { + handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES); + } + if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) { + handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES); + } + if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) { + handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES); + } + + textStyle = style; + lastTextStyle = null; + } + + private void endSpan() 
throws SAXException { + lastTextStyle = textStyle; + textStyle = null; + } + + private void lazyEndSpan() throws SAXException { + if (lastTextStyle == null) { + return; + } + + if (lastTextStyle.underlined) { + handler.endElement(XHTML, "u", "u"); + } + if (lastTextStyle.italic) { + handler.endElement(XHTML, "i", "i"); + } + if (lastTextStyle.bold) { + handler.endElement(XHTML, "b", "b"); + } + + lastTextStyle = null; + } + + @Override + public void startElement( + String namespaceURI, String localName, String qName, + Attributes attrs) throws SAXException { + // keep track of current node type. If it is a text node, + // a bit at the current depth its set in textNodeStack. + // characters() checks the top bit to determine, if the + // actual node is a text node to print out nodeDepth contains + // the depth of the current node and also marks top of stack. + assert nodeDepth >= 0; + + // Set styles + if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) { + String family = attrs.getValue(STYLE_NS, "family"); + if ("text".equals(family)) { + textStyle = new TextStyle(); + String name = attrs.getValue(STYLE_NS, "name"); + textStyleMap.put(name, textStyle); + } + } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) { + listStyle = new ListStyle(); + String name = attrs.getValue(STYLE_NS, "name"); + listStyleMap.put(name, listStyle); + } else if (textStyle != null && STYLE_NS.equals(namespaceURI) + && "text-properties".equals(localName)) { + String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style"); + if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) { + textStyle.italic = true; + } + String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight"); + if ("bold".equals(fontWeight) || "bolder".equals(fontWeight) + || (fontWeight != null && Character.isDigit(fontWeight.charAt(0)) + && Integer.valueOf(fontWeight) > 500)) { + textStyle.bold = true; + } + String underlineStyle = attrs.getValue(STYLE_NS, 
"text-underline-style"); + if (underlineStyle != null) { + textStyle.underlined = true; + } + } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) { + if ("list-level-style-bullet".equals(localName)) { + listStyle.ordered = false; + } else if ("list-level-style-number".equals(localName)) { + listStyle.ordered = true; + } + } + + textNodeStack.set(nodeDepth++, + isTextNode(namespaceURI, localName)); + // filter *all* content of some tags + assert completelyFiltered >= 0; + + if (needsCompleteFiltering(namespaceURI, localName)) { + completelyFiltered++; + } + // call next handler if no filtering + if (completelyFiltered == 0) { + // special handling of text:h, that are directly passed + // to incoming handler + if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { + final String el = headingStack.push(getXHTMLHeaderTagName(attrs)); + handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES); + } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) { + startList(attrs.getValue(TEXT_NS, "style-name")); + } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { + startSpan(attrs.getValue(TEXT_NS, "style-name")); + } else { + super.startElement(namespaceURI, localName, qName, attrs); + } + } + } + + @Override + public void endElement( + String namespaceURI, String localName, String qName) + throws SAXException { + if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) { + textStyle = null; + } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) { + listStyle = null; + } + + // call next handler if no filtering + if (completelyFiltered == 0) { + // special handling of text:h, that are directly passed + // to incoming handler + if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { + final String el = headingStack.pop(); + handler.endElement(XHTMLContentHandler.XHTML, el, el); + } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) { + endList(); + } else if 
(TEXT_NS.equals(namespaceURI) && "span".equals(localName)) { + endSpan(); + } else { + if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) { + lazyEndSpan(); + } + super.endElement(namespaceURI, localName, qName); + } + + // special handling of tabulators + if (TEXT_NS.equals(namespaceURI) + && ("tab-stop".equals(localName) + || "tab".equals(localName))) { + this.characters(TAB, 0, TAB.length); + } + } + + // revert filter for *all* content of some tags + if (needsCompleteFiltering(namespaceURI, localName)) { + completelyFiltered--; + } + assert completelyFiltered >= 0; + + // reduce current node depth + nodeDepth--; + assert nodeDepth >= 0; + } + + @Override + public void startPrefixMapping(String prefix, String uri) { + // remove prefix mappings as they should not occur in XHTML + } + + @Override + public void endPrefixMapping(String prefix) { + // remove prefix mappings as they should not occur in XHTML + } + } + + public static final String TEXT_NS = + "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; + + public static final String TABLE_NS = + "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; + + public static final String STYLE_NS = + "urn:oasis:names:tc:opendocument:xmlns:style:1.0"; + + public static final String FORMATTING_OBJECTS_NS = + "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0"; + + public static final String OFFICE_NS = + "urn:oasis:names:tc:opendocument:xmlns:office:1.0"; + + public static final String SVG_NS = + "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0"; + + public static final String PRESENTATION_NS = + "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0"; + + public static final String DRAW_NS = + "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"; + + public static final String XLINK_NS = "http://www.w3.org/1999/xlink"; + + protected static final char[] TAB = new char[]{'\t'}; + + private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); + + /** + * Mappings between ODF tag names and 
XHTML tag names
     * (including attributes). Any ODF tag name or attribute without an entry
     * here is ignored and left out of the event stream.
     */
    private static final HashMap<QName, TargetElement> MAPPINGS =
            new HashMap<QName, TargetElement>();

    static {
        // Plain one-to-one element mappings for text:-tags and friends.
        // text:h-tags are not listed here: they are mapped specifically in
        // startElement/endElement.
        mapToXhtml(TEXT_NS, "p", "p");
        mapToXhtml(TEXT_NS, "line-break", "br");
        mapToXhtml(TEXT_NS, "list-item", "li");
        mapToXhtml(TEXT_NS, "note", "div");
        mapToXhtml(OFFICE_NS, "annotation", "div");
        mapToXhtml(PRESENTATION_NS, "notes", "div");
        mapToXhtml(DRAW_NS, "object", "object");
        mapToXhtml(DRAW_NS, "text-box", "div");
        mapToXhtml(SVG_NS, "title", "span");
        mapToXhtml(SVG_NS, "desc", "span");
        mapToXhtml(TEXT_NS, "span", "span");

        // text:a becomes an XHTML anchor; its xlink attributes are carried
        // over as the un-namespaced href/title attributes.
        final HashMap<QName, QName> anchorAttributes =
                new HashMap<QName, QName>();
        anchorAttributes.put(new QName(XLINK_NS, "href"), new QName("href"));
        anchorAttributes.put(new QName(XLINK_NS, "title"), new QName("title"));
        MAPPINGS.put(
                new QName(TEXT_NS, "a"),
                new TargetElement(XHTML, "a", anchorAttributes));

        // HTML tables are created from table:-tags.
        mapToXhtml(TABLE_NS, "table", "table");
        // Repeating of rows is ignored; for columns, see below!
        mapToXhtml(TABLE_NS, "table-row", "tr");

        // Table cells need their spanning attributes translated to
        // colspan/rowspan.
        final HashMap<QName, QName> cellAttributes =
                new HashMap<QName, QName>();
        cellAttributes.put(
                new QName(TABLE_NS, "number-columns-spanned"),
                new QName("colspan"));
        cellAttributes.put(
                new QName(TABLE_NS, "number-rows-spanned"),
                new QName("rowspan"));
        /* TODO: The following is not correct; the cell should be repeated, not spanned!
         * The code generates one HTML cell spanning all repeated columns, to make the
         * cell look correct. Problems may occur when both spanning and repeating are
         * given, which is not allowed by the spec. Cell spanning instead of repeating
         * is not a problem, because OpenOffice uses it only for empty cells.
         */
        cellAttributes.put(
                new QName(TABLE_NS, "number-columns-repeated"),
                new QName("colspan"));
        MAPPINGS.put(
                new QName(TABLE_NS, "table-cell"),
                new TargetElement(XHTML, "td", cellAttributes));
    }

    /**
     * Registers an attribute-less mapping from an ODF element to an XHTML
     * element in {@link #MAPPINGS}.
     *
     * @param odfNamespace namespace URI of the ODF element
     * @param odfName      local name of the ODF element
     * @param xhtmlName    local name of the XHTML element to emit instead
     */
    private static void mapToXhtml(
            String odfNamespace, String odfName, String xhtmlName) {
        MAPPINGS.put(
                new QName(odfNamespace, odfName),
                new TargetElement(XHTML, xhtmlName));
    }

    /**
     * Reports the media types handled by this parser.
     *
     * @param context the parse context (unused)
     * @return always an empty set, as this is not a top-level parser
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        final Set<MediaType> none = Collections.emptySet();
        return none;
    }

    /**
     * Parses the given stream, wrapping {@code handler} in an
     * {@link XHTMLContentHandler} before delegating to
     * {@link #parseInternal(InputStream, ContentHandler, Metadata, ParseContext)}.
     *
     * @param stream   the XML content to parse
     * @param handler  receiver of the generated SAX events
     * @param metadata document metadata, passed to the XHTML handler and on
     *                 to {@code parseInternal}
     * @param context  the parse context, passed on to {@code parseInternal}
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the SAX events cannot be processed
     * @throws TikaException if the XML parser cannot be configured
     */
    public void parse(
            InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);
        parseInternal(stream, xhtml, metadata, context);
    }

    /**
     * Runs a non-validating, namespace-aware SAX parse of {@code stream} and
     * feeds the events through the element-mapping handler built on
     * {@link #MAPPINGS} into {@code handler}.
     *
     * @param stream   the XML content to parse; it is wrapped in a
     *                 {@code CloseShieldInputStream} before being handed to
     *                 the SAX parser
     * @param handler  receiver of the mapped SAX events
     * @param metadata not used by this method
     * @param context  not used by this method
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the SAX events cannot be processed
     * @throws TikaException if the XML parser cannot be configured
     */
    void parseInternal(
            InputStream stream, final ContentHandler handler,
            Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {

        final DefaultHandler mappingHandler =
                new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);

        try {
            final SAXParser saxParser = newSaxParserFactory().newSAXParser();
            saxParser.parse(
                    new CloseShieldInputStream(stream),
                    new OfflineContentHandler(
                            new NSNormalizerContentHandler(mappingHandler)));
        } catch (ParserConfigurationException e) {
            throw new TikaException("XML parser configuration error", e);
        }
    }

    /**
     * Creates the SAX parser factory used by {@code parseInternal}:
     * non-validating, namespace-aware, and with secure processing requested.
     *
     * @return the configured factory
     * @throws ParserConfigurationException if the factory rejects the
     *                                      secure-processing request this way
     * @throws SAXException                 if enabling secure processing fails
     *                                      for a reason other than the feature
     *                                      being unrecognized
     */
    private static SAXParserFactory newSaxParserFactory()
            throws ParserConfigurationException, SAXException {
        final SAXParserFactory saxFactory = SAXParserFactory.newInstance();
        saxFactory.setValidating(false);
        saxFactory.setNamespaceAware(true);
        try {
            saxFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        } catch (SAXNotRecognizedException e) {
            // TIKA-329: Some XML parsers do not support the secure-processing
            // feature, even though it's required by JAXP in Java 5. Ignoring
            // the exception is fine here, deployments without this feature
            // are inherently vulnerable to XML denial-of-service attacks.
        }
        return saxFactory;
    }

}
