ti...

bob Tue, 05 Jan 2016 19:51:48 -0800

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import javax.xml.namespace.QName;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.Placeholder;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
+import org.apache.poi.xslf.usermodel.XSLFComments;
+import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
+import org.apache.poi.xslf.usermodel.XSLFGroupShape;
+import org.apache.poi.xslf.usermodel.XSLFNotes;
+import org.apache.poi.xslf.usermodel.XSLFNotesMaster;
+import org.apache.poi.xslf.usermodel.XSLFPictureShape;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xslf.usermodel.XSLFShape;
+import org.apache.poi.xslf.usermodel.XSLFSheet;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
+import org.apache.poi.xslf.usermodel.XSLFTable;
+import org.apache.poi.xslf.usermodel.XSLFTableCell;
+import org.apache.poi.xslf.usermodel.XSLFTableRow;
+import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
+import org.apache.poi.xslf.usermodel.XSLFTextShape;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.apache.xmlbeans.XmlObject;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+    /**
+     * Creates a decorator around the given POI PowerPoint text extractor.
+     *
+     * @param context   parse context, handed unchanged to the superclass constructor
+     * @param extractor POI extractor, handed unchanged to the superclass constructor
+     */
+    public XSLFPowerPointExtractorDecorator(ParseContext context, 
XSLFPowerPointExtractor extractor) {
+        super(context, extractor);
+    }
+
+    /**
+     * Writes the text of every slide in the presentation to the XHTML handler.
+     * <p>
+     * For each slide, in order, this emits:
+     * <ul>
+     * <li>a {@code div class="slide-content"} holding the slide's own shapes;</li>
+     * <li>a {@code div class="slide-master-content"} holding the non-placeholder
+     * shapes of the slide layout;</li>
+     * <li>the non-placeholder shapes of the slide master, written after the
+     * layout div has been closed (they are not wrapped in a div of their own);</li>
+     * <li>if the slide has notes, a {@code div class="slide-notes"} holding the
+     * notes shapes followed by the non-placeholder shapes of the notes master
+     * (when one is present);</li>
+     * <li>if the slide has comments, one {@code p class="slide-comment"} per comment.</li>
+     * </ul>
+     *
+     * @param xhtml handler that receives the generated XHTML
+     * @throws SAXException if the handler fails while receiving an event
+     * @throws IOException  part of the declared signature
+     * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
+        XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
+        // Treated as optional: comments are still written when no authors are available
+        XSLFCommentAuthors commentAuthors = slideShow.getCommentAuthors();
+
+        for (XSLFSlide slide : slideShow.getSlides()) {
+            // "<slide file name>_" prefix applied to the ids of embedded resources,
+            // or null when the slide has no part name to derive it from
+            String slideDesc = null;
+            if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) {
+                slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString());
+                slideDesc += "_";
+            }
+
+            // slide content
+            xhtml.startElement("div", "class", "slide-content");
+            extractContent(slide.getShapes(), false, xhtml, slideDesc);
+            xhtml.endElement("div");
+
+            // slide layout which is the master sheet for this slide
+            xhtml.startElement("div", "class", "slide-master-content");
+            XSLFSlideLayout slideLayout = slide.getMasterSheet();
+            extractContent(slideLayout.getShapes(), true, xhtml, null);
+            xhtml.endElement("div");
+
+            // slide master which is the master sheet for all text layouts
+            XSLFSheet slideMaster = slideLayout.getMasterSheet();
+            extractContent(slideMaster.getShapes(), true, xhtml, null);
+
+            // notes (if present)
+            XSLFNotes slideNotes = slide.getNotes();
+            if (slideNotes != null) {
+                xhtml.startElement("div", "class", "slide-notes");
+
+                extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);
+
+                // master sheet for this notes; skipped when the notes have no master
+                XSLFNotesMaster notesMaster = slideNotes.getMasterSheet();
+                if (notesMaster != null) {
+                    extractContent(notesMaster.getShapes(), true, xhtml, null);
+                }
+                xhtml.endElement("div");
+            }
+
+            // comments (if present)
+            XSLFComments comments = slide.getComments();
+            if (comments != null) {
+                writeSlideComments(comments, commentAuthors, xhtml);
+            }
+        }
+    }
+
+    /**
+     * Writes each comment of one slide as a {@code p class="slide-comment"}.
+     * When the author can be resolved, the comment text is preceded by a bold
+     * label of the form {@code Name (Initials) - }.
+     *
+     * @param comments       comments of the slide; must not be null
+     * @param commentAuthors author lookup; when null, comments are written without a label
+     * @param xhtml          handler that receives the generated XHTML
+     * @throws SAXException if the handler fails while receiving an event
+     */
+    private void writeSlideComments(XSLFComments comments, XSLFCommentAuthors commentAuthors,
+                                    XHTMLContentHandler xhtml) throws SAXException {
+        StringBuilder authorStringBuilder = new StringBuilder();
+        for (int i = 0; i < comments.getNumberOfComments(); i++) {
+            authorStringBuilder.setLength(0);
+            CTComment comment = comments.getCommentAt(i);
+            xhtml.startElement("p", "class", "slide-comment");
+            CTCommentAuthor cta = (commentAuthors == null)
+                    ? null
+                    : commentAuthors.getAuthorById(comment.getAuthorId());
+            if (cta != null) {
+                if (cta.getName() != null) {
+                    authorStringBuilder.append(cta.getName());
+                }
+                if (cta.getInitials() != null) {
+                    if (authorStringBuilder.length() > 0) {
+                        authorStringBuilder.append(" ");
+                    }
+                    authorStringBuilder.append("(").append(cta.getInitials()).append(")");
+                }
+                // Separator only makes sense when both a label and a text exist
+                if (comment.getText() != null && authorStringBuilder.length() > 0) {
+                    authorStringBuilder.append(" - ");
+                }
+                if (authorStringBuilder.length() > 0) {
+                    xhtml.startElement("b");
+                    xhtml.characters(authorStringBuilder.toString());
+                    xhtml.endElement("b");
+                }
+            }
+            xhtml.characters(comment.getText());
+            xhtml.endElement("p");
+        }
+    }
+
+    /**
+     * Walks a list of shapes and writes their content to the XHTML handler.
+     * <ul>
+     * <li>Text shapes: one {@code p} per paragraph; skipped entirely when
+     * {@code skipPlaceholders} is set and the shape has a placeholder type.</li>
+     * <li>Group shapes: processed recursively with the same settings.</li>
+     * <li>Tables: written through {@code extractTable}.</li>
+     * <li>Other graphic frames: one empty {@code div class="embedded"} per
+     * {@code p:oleObj} descendant that carries a relationship id.</li>
+     * <li>Pictures: one empty {@code div class="embedded"} for the embedded blip,
+     * unless {@code skipPlaceholders} is set.</li>
+     * </ul>
+     *
+     * @param shapes           shapes to process
+     * @param skipPlaceholders whether placeholder text shapes and pictures are left out
+     * @param xhtml            handler that receives the generated XHTML
+     * @param slideDesc        prefix prepended to relationship ids, or null for none
+     * @throws SAXException if the handler fails while receiving an event
+     */
+    private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders,
+                                XHTMLContentHandler xhtml, String slideDesc)
+            throws SAXException {
+        for (XSLFShape sh : shapes) {
+            if (sh instanceof XSLFTextShape) {
+                XSLFTextShape txt = (XSLFTextShape) sh;
+                Placeholder ph = txt.getTextType();
+                if (skipPlaceholders && ph != null) {
+                    continue;
+                }
+                for (XSLFTextParagraph p : txt.getTextParagraphs()) {
+                    xhtml.element("p", p.getText());
+                }
+            } else if (sh instanceof XSLFGroupShape) {
+                // recurse into groups of shapes
+                XSLFGroupShape group = (XSLFGroupShape) sh;
+                extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc);
+            } else if (sh instanceof XSLFTable) {
+                // Checked before XSLFGraphicFrame so tables are not handled as generic frames.
+                // Unlike tables in Word, ppt/x tables are not expected to nest.
+                extractTable((XSLFTable) sh, xhtml);
+            } else if (sh instanceof XSLFGraphicFrame) {
+                XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
+                XmlObject[] sp = frame.getXmlObject().selectPath(
+                        "declare namespace "
+                                + "p='http://schemas.openxmlformats.org/presentationml/2006/main' "
+                                + ".//*/p:oleObj");
+                if (sp != null) {
+                    for (XmlObject emb : sp) {
+                        XmlObject relIDAtt = emb.selectAttribute(new QName(
+                                "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+                                "id"));
+                        if (relIDAtt != null) {
+                            writeEmbeddedPlaceholder(
+                                    relIDAtt.getDomNode().getNodeValue(), slideDesc, xhtml);
+                        }
+                    }
+                }
+            } else if (sh instanceof XSLFPictureShape) {
+                if (!skipPlaceholders && (sh.getXmlObject() instanceof CTPicture)) {
+                    CTPicture ctPic = ((CTPicture) sh.getXmlObject());
+                    if (ctPic.getBlipFill() != null && ctPic.getBlipFill().getBlip() != null) {
+                        String relID = ctPic.getBlipFill().getBlip().getEmbed();
+                        if (relID != null) {
+                            writeEmbeddedPlaceholder(relID, slideDesc, xhtml);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Writes an empty {@code div class="embedded"} whose {@code id} is the
+     * relationship id, prefixed with {@code slideDesc} when that is not null.
+     *
+     * @param relID     relationship id of the embedded resource; must not be null
+     * @param slideDesc prefix for the id, or null for none
+     * @param xhtml     handler that receives the generated XHTML
+     * @throws SAXException if the handler fails while receiving an event
+     */
+    private void writeEmbeddedPlaceholder(String relID, String slideDesc,
+                                          XHTMLContentHandler xhtml) throws SAXException {
+        String id = (slideDesc == null) ? relID : slideDesc + relID;
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+        attributes.addAttribute("", "id", "id", "CDATA", id);
+        xhtml.startElement("div", attributes);
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes a PowerPoint table as an XHTML {@code table}: one {@code tr} per
+     * row and one {@code td} per cell, each containing the cell's text.
+     *
+     * @param tbl   table to write
+     * @param xhtml handler that receives the generated XHTML
+     * @throws SAXException if the handler fails while receiving an event
+     */
+    private void extractTable(XSLFTable tbl, XHTMLContentHandler xhtml) throws SAXException {
+        xhtml.startElement("table");
+        for (XSLFTableRow row : tbl) {
+            xhtml.startElement("tr");
+            for (XSLFTableCell c : row.getCells()) {
+                xhtml.startElement("td");
+                xhtml.characters(c.getText());
+                xhtml.endElement("td");
+            }
+            xhtml.endElement("tr");
+        }
+        xhtml.endElement("table");
+    }
+
+    /**
+     * Lists the package parts that may reference embedded resources.
+     * In PowerPoint files, slides have things embedded in them, and slide
+     * drawings have the images, so every slide part is returned followed by
+     * the internal targets of its VML drawing relationships.
+     *
+     * @return slide parts and their internal VML drawing parts, in slide order
+     * @throws TikaException if the slide show, a slide part or a drawing
+     *                       relationship cannot be resolved; the underlying
+     *                       exception is kept as the cause
+     */
+    @Override
+    protected List<PackagePart> getMainDocumentParts() throws TikaException {
+        List<PackagePart> parts = new ArrayList<>();
+        XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
+        XSLFSlideShow document;
+        try {
+            document = slideShow._getXSLFSlideShow(); // TODO Avoid this in future
+        } catch (Exception e) {
+            // Shouldn't happen; keep the cause so the stack trace is not lost
+            throw new TikaException(e.getMessage(), e);
+        }
+
+        CTSlideIdList ctSlideIdList = document.getSlideReferences();
+        if (ctSlideIdList != null) {
+            for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) {
+                CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i);
+                // Add the slide
+                PackagePart slidePart;
+                try {
+                    slidePart = document.getSlidePart(ctSlide);
+                } catch (IOException | XmlException e) {
+                    throw new TikaException("Broken OOXML file", e);
+                }
+                parts.add(slidePart);
+
+                // If it has drawings, return those too
+                try {
+                    for (PackageRelationship rel
+                            : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
+                        if (rel.getTargetMode() == TargetMode.INTERNAL) {
+                            PackagePartName relName =
+                                    PackagingURIHelper.createPartName(rel.getTargetURI());
+                            parts.add(rel.getPackage().getPart(relName));
+                        }
+                    }
+                } catch (InvalidFormatException e) {
+                    throw new TikaException("Broken OOXML file", e);
+                }
+            }
+        }
+        return parts;
+    }
+}


Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.HeaderFooter;
+import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
+import 
org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.poi.xssf.model.CommentsTable;
+import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFComment;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
+import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
+    /**
+     * Shared helper that splits a raw header/footer string into its
+     * left, center and right sections.
+     */
+    private static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
+    /** Event-based POI extractor this decorator reads the package from. */
+    private final XSSFEventBasedExcelExtractor extractor;
+    /** Formats cell values, using the locale given at construction when there is one. */
+    private final DataFormatter formatter;
+    /** Sheet parts seen while building the XHTML, in iteration order. */
+    private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
+    /** Metadata of the document being parsed; assigned by {@code getXHTML}. */
+    private Metadata metadata;
+
+    /**
+     * Creates a decorator around an event-based Excel extractor. The extractor
+     * is switched to report formula results rather than formulas and is given
+     * the supplied locale.
+     *
+     * @param context   parse context, handed unchanged to the superclass constructor
+     * @param extractor event-based POI extractor to decorate
+     * @param locale    locale for cell formatting; when null, a
+     *                  {@code DataFormatter} without an explicit locale is used
+     */
+    public XSSFExcelExtractorDecorator(
+            ParseContext context, XSSFEventBasedExcelExtractor extractor, Locale locale) {
+        super(context, extractor);
+
+        this.extractor = extractor;
+        this.extractor.setFormulasNotResults(false);
+        this.extractor.setLocale(locale);
+
+        this.formatter = (locale != null) ? new DataFormatter(locale) : new DataFormatter();
+    }
+
+    /**
+     * Remembers the metadata object for later use by this extractor, records
+     * the document as not protected, and then delegates to the superclass.
+     * {@code processSheet} overwrites the flag with "true" when it encounters
+     * a sheet protection element.
+     *
+     * @param handler  passed unchanged to the superclass implementation
+     * @param metadata metadata to populate; stored in the {@code metadata} field
+     * @param context  passed unchanged to the superclass implementation
+     */
+    @Override
+    public void getXHTML(
+            ContentHandler handler, Metadata metadata, ParseContext context)
+            throws SAXException, XmlException, IOException, TikaException {
+
+        // Keep a reference so sheet processing can update the PROTECTED flag
+        this.metadata = metadata;
+        // Default value; only changed if a protected sheet is found
+        metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+        super.getXHTML(handler, metadata, context);
+    }
+
+    /**
+     * Streams every sheet of the workbook to the XHTML handler. Each sheet
+     * becomes a {@code div} containing, in order: an {@code h1} with the sheet
+     * name, a {@code table}/{@code tbody} with the cell contents, one paragraph
+     * per non-empty header and footer, and one paragraph per simple shape that
+     * has text. The part of every visited sheet is recorded in {@code sheetParts}.
+     *
+     * @param xhtml handler that receives the generated XHTML
+     * @throws SAXException if the handler or the sheet parser fails
+     * @throws XmlException if the package cannot be opened for reading; wraps
+     *                      the underlying OpenXML4J exception
+     * @throws IOException  if reading from the package fails
+     * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+            XmlException, IOException {
+        OPCPackage pkg = extractor.getPackage();
+
+        StylesTable styleTable;
+        XSSFReader.SheetIterator sheets;
+        ReadOnlySharedStringsTable sharedStrings;
+        try {
+            XSSFReader reader = new XSSFReader(pkg);
+            styleTable = reader.getStylesTable();
+            sheets = (XSSFReader.SheetIterator) reader.getSheetsData();
+            sharedStrings = new ReadOnlySharedStringsTable(pkg);
+        } catch (OpenXML4JException e) {
+            // InvalidFormatException is an OpenXML4JException, so this one
+            // handler covers both and wraps them the same way
+            throw new XmlException(e);
+        }
+
+        while (sheets.hasNext()) {
+            InputStream sheetStream = sheets.next();
+            sheetParts.add(sheets.getSheetPart());
+
+            SheetTextAsHTML sheetHandler = new SheetTextAsHTML(xhtml);
+            CommentsTable sheetComments = sheets.getSheetComments();
+
+            // Open the sheet container and write its name
+            xhtml.startElement("div");
+            xhtml.element("h1", sheets.getSheetName());
+
+            // Main sheet contents
+            xhtml.startElement("table");
+            xhtml.startElement("tbody");
+            processSheet(sheetHandler, sheetComments, styleTable, sharedStrings, sheetStream);
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+
+            // Headers and footers are only known once the sheet has been
+            // processed, so they have to follow the table
+            for (String headerText : sheetHandler.headers) {
+                extractHeaderFooter(headerText, xhtml);
+            }
+            for (String footerText : sheetHandler.footers) {
+                extractHeaderFooter(footerText, xhtml);
+            }
+            processShapes(sheets.getShapes(), xhtml);
+
+            // Close the sheet container
+            xhtml.endElement("div");
+        }
+    }
+
+    /**
+     * Converts one raw header or footer string to text and writes it as a
+     * paragraph, unless the resulting text is empty.
+     *
+     * @param hf    raw header/footer string
+     * @param xhtml handler that receives the generated XHTML
+     * @throws SAXException if the handler fails while receiving an event
+     */
+    private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
+            throws SAXException {
+        HeaderFooter sections = new HeaderFooterFromString(hf);
+        String text = ExcelExtractor._extractHeaderFooter(sections);
+        if (text.length() == 0) {
+            return;
+        }
+        xhtml.element("p", text);
+    }
+
+    /**
+     * Writes the text of every simple shape as a paragraph. Shapes of other
+     * types and shapes with null or empty text are ignored.
+     *
+     * @param shapes shapes of one sheet; null is treated as "no shapes"
+     * @param xhtml  handler that receives the generated XHTML
+     * @throws SAXException if the handler fails while receiving an event
+     */
+    private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml)
+            throws SAXException {
+        if (shapes == null) {
+            return;
+        }
+        for (XSSFShape candidate : shapes) {
+            if (!(candidate instanceof XSSFSimpleShape)) {
+                continue;
+            }
+            String text = ((XSSFSimpleShape) candidate).getText();
+            if (text == null || text.length() == 0) {
+                continue;
+            }
+            xhtml.element("p", text);
+        }
+    }
+
+    /**
+     * Parses the XML of one sheet with a SAX parser. Sheet events are passed
+     * through an {@code XSSFSheetInterestingPartsCapturer} to a POI
+     * {@code XSSFSheetXMLHandler}, which reports rows and cells to
+     * {@code sheetContentsExtractor}. If the sheet contains a protection
+     * element and metadata has been set via {@code getXHTML}, the
+     * {@code PROTECTED} metadata entry is set to "true".
+     * <p>
+     * The input stream is always closed before this method returns, including
+     * when parsing fails.
+     *
+     * @param sheetContentsExtractor receiver of row, cell and header/footer events
+     * @param comments               comments of the sheet, handed to the POI handler
+     * @param styles                 workbook styles, handed to the POI handler
+     * @param strings                shared strings, handed to the POI handler
+     * @param sheetInputStream       stream over the sheet XML; closed by this method
+     * @throws IOException      if reading or closing the stream fails
+     * @throws SAXException     if the sheet XML cannot be parsed
+     * @throws RuntimeException if no SAX parser can be created; the
+     *                          {@code ParserConfigurationException} is kept as the cause
+     */
+    public void processSheet(
+            SheetContentsHandler sheetContentsExtractor,
+            CommentsTable comments,
+            StylesTable styles,
+            ReadOnlySharedStringsTable strings,
+            InputStream sheetInputStream)
+            throws IOException, SAXException {
+        // try-with-resources closes the stream on every path (and tolerates null)
+        try (InputStream in = sheetInputStream) {
+            InputSource sheetSource = new InputSource(in);
+            // NOTE(review): the factory is used with its default settings; confirm
+            // whether secure-processing / external-entity restrictions are needed
+            // for untrusted workbooks
+            SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+            SAXParser saxParser = saxFactory.newSAXParser();
+            XMLReader sheetParser = saxParser.getXMLReader();
+            XSSFSheetInterestingPartsCapturer handler =
+                    new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
+                            styles, comments, strings, sheetContentsExtractor, formatter, false));
+            sheetParser.setContentHandler(handler);
+            sheetParser.parse(sheetSource);
+
+            // metadata is only assigned by getXHTML; this method is public and
+            // may be called without it
+            if (handler.hasProtection && metadata != null) {
+                metadata.set(TikaMetadataKeys.PROTECTED, "true");
+            }
+        } catch (ParserConfigurationException e) {
+            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Lists the package parts that may reference embedded resources.
+     * In Excel files, sheets have things embedded in them, and sheet drawings
+     * have the images. For every sheet recorded in {@code sheetParts} the
+     * result holds the sheet part, then the internal targets of its drawing
+     * relationships, then the internal targets of its VML drawing relationships.
+     *
+     * @return sheet parts, each followed by its internal drawing parts
+     * @throws TikaException if a relationship or its target name is invalid
+     */
+    @Override
+    protected List<PackagePart> getMainDocumentParts() throws TikaException {
+        // Relationship types to follow for each sheet, in output order
+        String[] drawingRelations = {
+                XSSFRelation.DRAWINGS.getRelation(),
+                XSSFRelation.VML_DRAWINGS.getRelation()
+        };
+
+        List<PackagePart> result = new ArrayList<PackagePart>();
+        for (PackagePart sheet : sheetParts) {
+            // The sheet itself comes first
+            result.add(sheet);
+
+            // Then whatever drawings it points to inside the package
+            try {
+                for (String relation : drawingRelations) {
+                    for (PackageRelationship link : sheet.getRelationshipsByType(relation)) {
+                        if (link.getTargetMode() != TargetMode.INTERNAL) {
+                            continue;
+                        }
+                        PackagePartName target =
+                                PackagingURIHelper.createPartName(link.getTargetURI());
+                        result.add(link.getPackage().getPart(target));
+                    }
+                }
+            } catch (InvalidFormatException e) {
+                throw new TikaException("Broken OOXML file", e);
+            }
+        }
+
+        return result;
+    }
+
+    /**
+     * Turns formatted sheet events into HTML: rows become {@code tr}, cells
+     * become {@code td}, and header/footer strings are collected for the
+     * caller to write once the sheet has been processed.
+     * <p>
+     * NOTE(review): a {@code SAXException} raised by the XHTML handler is
+     * dropped in every callback, so the callback returns normally. Presumably
+     * this is because the callbacks declare no checked exceptions; confirm
+     * whether handler failures should be surfaced to the caller.
+     */
+    protected static class SheetTextAsHTML implements SheetContentsHandler {
+        private final XHTMLContentHandler xhtml;
+        private final List<String> headers = new ArrayList<String>();
+        private final List<String> footers = new ArrayList<String>();
+
+        /**
+         * @param xhtml handler that receives the generated row and cell elements
+         */
+        protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        /** Opens a {@code tr}; the row number is not used. */
+        @Override
+        public void startRow(int rowNum) {
+            try {
+                xhtml.startElement("tr");
+            } catch (SAXException ignored) {
+                // dropped, see class comment
+            }
+        }
+
+        /** Closes the current {@code tr}; the row number is not used. */
+        @Override
+        public void endRow(int rowNum) {
+            try {
+                xhtml.endElement("tr");
+            } catch (SAXException ignored) {
+                // dropped, see class comment
+            }
+        }
+
+        /**
+         * Writes one {@code td} holding the formatted value (when not null)
+         * and, when the cell has a comment, a line break followed by
+         * {@code author: comment text}. The cell reference is not used.
+         */
+        @Override
+        public void cell(String cellRef, String formattedValue, XSSFComment comment) {
+            try {
+                xhtml.startElement("td");
+
+                if (formattedValue != null) {
+                    xhtml.characters(formattedValue);
+                }
+
+                if (comment != null) {
+                    xhtml.startElement("br");
+                    xhtml.endElement("br");
+                    xhtml.characters(comment.getAuthor());
+                    xhtml.characters(": ");
+                    xhtml.characters(comment.getString().getString());
+                }
+
+                xhtml.endElement("td");
+            } catch (SAXException ignored) {
+                // dropped, see class comment
+            }
+        }
+
+        /**
+         * Stores the raw text in the header list or the footer list,
+         * depending on {@code isHeader}. The tag name is not used.
+         */
+        @Override
+        public void headerFooter(String text, boolean isHeader, String tagName) {
+            List<String> target = isHeader ? headers : footers;
+            target.add(text);
+        }
+    }
+
+    /**
+     * Read-only {@code HeaderFooter} view over a raw header/footer string.
+     * The getters ask the shared {@code hfHelper} for the matching section of
+     * the string; the setters do nothing.
+     */
+    protected static class HeaderFooterFromString implements HeaderFooter {
+        private final String text;
+
+        /**
+         * @param text raw header/footer string to expose
+         */
+        protected HeaderFooterFromString(String text) {
+            this.text = text;
+        }
+
+        /** @return the left section of the raw string, as given by the helper */
+        @Override
+        public String getLeft() {
+            return hfHelper.getLeftSection(text);
+        }
+
+        /** @return the center section of the raw string, as given by the helper */
+        @Override
+        public String getCenter() {
+            return hfHelper.getCenterSection(text);
+        }
+
+        /** @return the right section of the raw string, as given by the helper */
+        @Override
+        public String getRight() {
+            return hfHelper.getRightSection(text);
+        }
+
+        /** Ignored; the underlying text is never modified. */
+        @Override
+        public void setLeft(String paramString) {
+        }
+
+        /** Ignored; the underlying text is never modified. */
+        @Override
+        public void setCenter(String paramString) {
+        }
+
+        /** Ignored; the underlying text is never modified. */
+        @Override
+        public void setRight(String paramString) {
+        }
+    }
+
+    /**
+     * Pass-through SAX handler that forwards every event unchanged to a
+     * delegate, and additionally remembers whether an element whose qualified
+     * name is {@code sheetProtection} has been started.
+     */
+    protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
+        private final ContentHandler delegate;
+        /** Becomes true on the first {@code sheetProtection} start tag and stays true. */
+        private boolean hasProtection = false;
+
+        /**
+         * @param delegate handler that receives every event
+         */
+        protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
+            this.delegate = delegate;
+        }
+
+        @Override
+        public void setDocumentLocator(Locator locator) {
+            delegate.setDocumentLocator(locator);
+        }
+
+        @Override
+        public void startDocument() throws SAXException {
+            delegate.startDocument();
+        }
+
+        @Override
+        public void endDocument() throws SAXException {
+            delegate.endDocument();
+        }
+
+        @Override
+        public void startPrefixMapping(String prefix, String uri)
+                throws SAXException {
+            delegate.startPrefixMapping(prefix, uri);
+        }
+
+        @Override
+        public void endPrefixMapping(String prefix) throws SAXException {
+            delegate.endPrefixMapping(prefix);
+        }
+
+        /** Records sheet protection, then forwards the event. */
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes atts) throws SAXException {
+            hasProtection |= "sheetProtection".equals(qName);
+            delegate.startElement(uri, localName, qName, atts);
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName)
+                throws SAXException {
+            delegate.endElement(uri, localName, qName);
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            delegate.characters(ch, start, length);
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length)
+                throws SAXException {
+            delegate.ignorableWhitespace(ch, start, length);
+        }
+
+        @Override
+        public void processingInstruction(String target, String data)
+                throws SAXException {
+            delegate.processingInstruction(target, data);
+        }
+
+        @Override
+        public void skippedEntity(String name) throws SAXException {
+            delegate.skippedEntity(name);
+        }
+    }
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFNum;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.tika.parser.microsoft.AbstractListManager;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl;
+
+
+/**
+ * Computes the visible list number (e.g. "1.", "B.") for numbered paragraphs
+ * of a Word (XWPF) document.
+ * <p>
+ * Counter state is kept in the maps inherited from {@link AbstractListManager}:
+ * {@code listLevelMap} is keyed by the abstract numbering id and
+ * {@code overrideTupleMap} by the paragraph's numId (which paragraph series
+ * the paragraph is a member of).
+ */
+public class XWPFListManager extends AbstractListManager {
+    /**
+     * True when the {@code CTNumLvl} schema class can be loaded; level
+     * overrides are only read when it is.
+     */
+    private static final boolean OVERRIDE_AVAILABLE;
+    //if this shows up as the lvlText, don't show a number
+    private static final String SKIP_FORMAT = Character.toString((char) 61623);
+
+    static {
+        boolean available = false;
+        try {
+            Class.forName("org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl");
+            available = true;
+        } catch (ClassNotFoundException ignored) {
+            // The schema jar on the classpath lacks CTNumLvl; overrides stay disabled.
+        }
+        // Record the probe result (it used to be discarded and the flag forced to false).
+        OVERRIDE_AVAILABLE = available;
+    }
+
+    /** Numbering definitions of the document; may be null when the document has none. */
+    private final XWPFNumbering numbering;
+
+    /**
+     * @param document document whose numbering definitions are consulted
+     */
+    public XWPFListManager(XWPFDocument document) {
+        numbering = document.getNumbering();
+    }
+
+    /**
+     * Advances the counter of the list the paragraph belongs to and formats
+     * the resulting number.
+     *
+     * @param paragraph paragraph
+     * @return the formatted number or an empty string if something went wrong
+     */
+    public String getFormattedNumber(final XWPFParagraph paragraph) {
+        // Nothing can be resolved without numbering definitions or without
+        // the paragraph's list id and level.
+        if (numbering == null || paragraph.getNumID() == null
+                || paragraph.getNumIlvl() == null) {
+            return "";
+        }
+        int currNumId = paragraph.getNumID().intValue();
+        XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
+        if (xwpfNum == null) {
+            return "";
+        }
+        CTNum ctNum = xwpfNum.getCTNum();
+        CTDecimalNumber abNum = ctNum.getAbstractNumId();
+        if (abNum == null || abNum.getVal() == null) {
+            return "";
+        }
+        int currAbNumId = abNum.getVal().intValue();
+
+        ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+        if (lc == null) {
+            lc = loadLevelTuples(abNum);
+            if (lc == null) {
+                // the num points at an abstract num that does not exist
+                return "";
+            }
+        }
+        LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+        if (overrideTuples == null) {
+            overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels());
+        }
+
+        String formattedString =
+                lc.incrementLevel(paragraph.getNumIlvl().intValue(), overrideTuples);
+
+        // remember the (possibly freshly loaded) state for the next paragraph
+        listLevelMap.put(currAbNumId, lc);
+        overrideTupleMap.put(currNumId, overrideTuples);
+
+        return formattedString;
+    }
+
+    /**
+     * Builds the per-level override tuples of a num.
+     *
+     * @param ctNum  num whose level overrides are read
+     * @param length number of levels to produce
+     * @return one tuple per level, or null when the num has no overrides or
+     *         the override schema class is unavailable
+     */
+    private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) {
+        if (!OVERRIDE_AVAILABLE) {
+            return null;
+        }
+        int overrideLength = ctNum.sizeOfLvlOverrideArray();
+        if (overrideLength == 0) {
+            return null;
+        }
+        LevelTuple[] levelTuples = new LevelTuple[length];
+        for (int i = 0; i < length; i++) {
+            // NOTE(review): overrides are looked up by array position, not by
+            // their ilvl attribute -- confirm this is intended.
+            CTNumLvl ctNumLvl = (i < overrideLength) ? ctNum.getLvlOverrideArray(i) : null;
+            if (ctNumLvl != null) {
+                levelTuples[i] = buildTuple(i, ctNumLvl.getLvl());
+            } else {
+                levelTuples[i] = new LevelTuple("%" + i + ".");
+            }
+        }
+        return levelTuples;
+    }
+
+    /**
+     * Loads the level definitions of an abstract num.
+     *
+     * @param abNum reference to the abstract num
+     * @return a fresh counter over the abstract num's levels, or null when
+     *         the referenced abstract num cannot be found
+     */
+    private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) {
+        //Unfortunately, we need to go this far into the underlying structure
+        //to get the abstract num information for the edge case where
+        //someone skips a level and the format is not context-free, e.g. "1.B.i".
+        XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal());
+        if (abstractNum == null) {
+            return null;
+        }
+        CTAbstractNum ctAbstractNum = abstractNum.getCTAbstractNum();
+
+        LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()];
+        for (int i = 0; i < levels.length; i++) {
+            levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i));
+        }
+        return new ParagraphLevelCounter(levels);
+    }
+
+    /**
+     * Converts one level definition into a {@code LevelTuple}, falling back
+     * to defaults for every piece of information the definition lacks.
+     *
+     * @param level zero-based level index, used for the default level text
+     * @param ctLvl level definition; may be null
+     * @return the tuple describing start, restart, text, format and legal flag
+     */
+    private LevelTuple buildTuple(int level, CTLvl ctLvl) {
+        boolean isLegal = false;
+        final int start;
+        int restart = -1;
+        String lvlText = "%" + level + ".";
+        String numFmt = "decimal";
+
+        if (ctLvl != null && ctLvl.getIsLgl() != null) {
+            isLegal = true;
+        }
+
+        if (ctLvl != null && ctLvl.getNumFmt() != null &&
+                ctLvl.getNumFmt().getVal() != null) {
+            numFmt = ctLvl.getNumFmt().getVal().toString();
+        }
+        if (ctLvl != null && ctLvl.getLvlRestart() != null &&
+                ctLvl.getLvlRestart().getVal() != null) {
+            restart = ctLvl.getLvlRestart().getVal().intValue();
+        }
+        if (ctLvl != null && ctLvl.getStart() != null &&
+                ctLvl.getStart().getVal() != null) {
+            start = ctLvl.getStart().getVal().intValue();
+        } else {
+            //this is a hack. Currently, this gets the lowest possible
+            //start for a given numFmt.  We should probably try to grab the
+            //restartNumberingAfterBreak value in
+            //e.g. <w:abstractNum w:abstractNumId="12" w15:restartNumberingAfterBreak="0">???
+            if ("decimal".equals(numFmt) || "ordinal".equals(numFmt)
+                    || "decimalZero".equals(numFmt)) {
+                start = 0;
+            } else {
+                start = 1;
+            }
+        }
+        if (ctLvl != null && ctLvl.getLvlText() != null
+                && ctLvl.getLvlText().getVal() != null) {
+            lvlText = ctLvl.getLvlText().getVal();
+        }
+        return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,459 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import javax.xml.namespace.QName;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
+import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
+import org.apache.poi.xwpf.usermodel.BodyType;
+import org.apache.poi.xwpf.usermodel.IBody;
+import org.apache.poi.xwpf.usermodel.IBodyElement;
+import org.apache.poi.xwpf.usermodel.ICell;
+import org.apache.poi.xwpf.usermodel.IRunElement;
+import org.apache.poi.xwpf.usermodel.ISDTContent;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.xwpf.usermodel.XWPFPicture;
+import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
+import org.apache.poi.xwpf.usermodel.XWPFSDT;
+import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
+import org.apache.poi.xwpf.usermodel.XWPFStyle;
+import org.apache.poi.xwpf.usermodel.XWPFStyles;
+import org.apache.poi.xwpf.usermodel.XWPFTable;
+import org.apache.poi.xwpf.usermodel.XWPFTableCell;
+import org.apache.poi.xwpf.usermodel.XWPFTableRow;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.WordExtractor;
+import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlCursor;
+import org.apache.xmlbeans.XmlException;
+import org.apache.xmlbeans.XmlObject;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+
+    // could be improved by using the real delimiter in xchFollow [MS-DOC], 
v20140721, 2.4.6.3, Part 3, Step 3
+    private static final String LIST_DELIMITER = " ";
+
+
+    private XWPFDocument document;
+    private XWPFStyles styles;
+
+    /**
+     * Creates a decorator around the given POI extractor and keeps hold of
+     * the extractor's document and that document's style table.
+     *
+     * @param context   parse context, handed on to the superclass
+     * @param extractor POI extractor whose document is to be converted
+     */
+    public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
+        super(context, extractor);
+
+        this.document = (XWPFDocument) extractor.getDocument();
+        this.styles = this.document.getStyles();
+    }
+
+    /**
+     * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
+        XWPFListManager listManager = new XWPFListManager(document);
+        // headers
+        if (hfPolicy != null) {
+            extractHeaders(xhtml, hfPolicy, listManager);
+        }
+
+        // process text in the order that it occurs in
+        extractIBodyText(document, listManager, xhtml);
+
+        // then all document tables
+        if (hfPolicy != null) {
+            extractFooters(xhtml, hfPolicy, listManager);
+        }
+    }
+
+    /**
+     * Walks the elements of a body (document, table cell, ...) and writes out
+     * each paragraph, table and structured document tag it contains.
+     *
+     * @param bodyElement body whose elements are extracted
+     * @param listManager list numbering state shared across the document
+     * @param xhtml       handler receiving the output
+     */
+    private void extractIBodyText(IBody bodyElement, XWPFListManager listManager,
+                                  XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        for (IBodyElement child : bodyElement.getBodyElements()) {
+            if (child instanceof XWPFParagraph) {
+                extractParagraph((XWPFParagraph) child, listManager, xhtml);
+            }
+            if (child instanceof XWPFTable) {
+                extractTable((XWPFTable) child, listManager, xhtml);
+            }
+            if (child instanceof XWPFSDT) {
+                extractSDT((XWPFSDT) child, xhtml);
+            }
+        }
+    }
+
+    private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws 
SAXException,
+            XmlException, IOException {
+        ISDTContent content = element.getContent();
+        String tag = "p";
+        xhtml.startElement(tag);
+        xhtml.characters(content.getText());
+        xhtml.endElement(tag);
+    }
+
+    private void extractParagraph(XWPFParagraph paragraph, XWPFListManager 
listManager,
+                                  XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        // If this paragraph is actually a whole new section, then
+        //  it could have its own headers and footers
+        // Check and handle if so
+        XWPFHeaderFooterPolicy headerFooterPolicy = null;
+        if (paragraph.getCTP().getPPr() != null) {
+            CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
+            if (ctSectPr != null) {
+                headerFooterPolicy =
+                        new XWPFHeaderFooterPolicy(document, ctSectPr);
+                extractHeaders(xhtml, headerFooterPolicy, listManager);
+            }
+        }
+
+        // Is this a paragraph, or a heading?
+        String tag = "p";
+        String styleClass = null;
+        if (paragraph.getStyleID() != null) {
+            XWPFStyle style = styles.getStyle(
+                    paragraph.getStyleID()
+            );
+
+            if (style != null && style.getName() != null) {
+                TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
+                        style.getName(), paragraph.getPartType() == 
BodyType.TABLECELL
+                );
+                tag = tas.getTag();
+                styleClass = tas.getStyleClass();
+            }
+        }
+
+        if (styleClass == null) {
+            xhtml.startElement(tag);
+        } else {
+            xhtml.startElement(tag, "class", styleClass);
+        }
+
+        writeParagraphNumber(paragraph, listManager, xhtml);
+        // Output placeholder for any embedded docs:
+
+        // TODO: replace w/ XPath/XQuery:
+        for (XWPFRun run : paragraph.getRuns()) {
+            XmlCursor c = run.getCTR().newCursor();
+            c.selectPath("./*");
+            while (c.toNextSelection()) {
+                XmlObject o = c.getObject();
+                if (o instanceof CTObject) {
+                    XmlCursor c2 = o.newCursor();
+                    c2.selectPath("./*");
+                    while (c2.toNextSelection()) {
+                        XmlObject o2 = c2.getObject();
+
+                        XmlObject embedAtt = o2.selectAttribute(new 
QName("Type"));
+                        if (embedAtt != null && 
embedAtt.getDomNode().getNodeValue().equals("Embed")) {
+                            // Type is "Embed"
+                            XmlObject relIDAtt = o2.selectAttribute(new 
QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships";, 
"id"));
+                            if (relIDAtt != null) {
+                                String relID = 
relIDAtt.getDomNode().getNodeValue();
+                                AttributesImpl attributes = new 
AttributesImpl();
+                                attributes.addAttribute("", "class", "class", 
"CDATA", "embedded");
+                                attributes.addAttribute("", "id", "id", 
"CDATA", relID);
+                                xhtml.startElement("div", attributes);
+                                xhtml.endElement("div");
+                            }
+                        }
+                    }
+                    c2.dispose();
+                }
+            }
+
+            c.dispose();
+        }
+
+        // Attach bookmarks for the paragraph
+        // (In future, we might put them in the right place, for now
+        //  we just put them in the correct paragraph)
+        for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); 
i++) {
+            CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
+            xhtml.startElement("a", "name", bookmark.getName());
+            xhtml.endElement("a");
+        }
+
+        TmpFormatting fmtg = new TmpFormatting(false, false);
+
+        // Do the iruns
+        for (IRunElement run : paragraph.getIRuns()) {
+            if (run instanceof XWPFSDT) {
+                fmtg = closeStyleTags(xhtml, fmtg);
+                processSDTRun((XWPFSDT) run, xhtml);
+                //for now, we're ignoring formatting in sdt
+                //if you hit an sdt reset to false
+                fmtg.setBold(false);
+                fmtg.setItalic(false);
+            } else {
+                fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
+            }
+        }
+        closeStyleTags(xhtml, fmtg);
+
+
+        // Now do any comments for the paragraph
+        XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, 
null);
+        String commentText = comments.getCommentText();
+        if (commentText != null && commentText.length() > 0) {
+            xhtml.characters(commentText);
+        }
+
+        String footnameText = paragraph.getFootnoteText();
+        if (footnameText != null && footnameText.length() > 0) {
+            xhtml.characters(footnameText + "\n");
+        }
+
+        // Also extract any paragraphs embedded in text boxes:
+        for (XmlObject embeddedParagraph : 
paragraph.getCTP().selectPath("declare namespace 
w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare 
namespace 
wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' 
.//*/wps:txbx/w:txbxContent/w:p")) {
+            extractParagraph(new 
XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), 
paragraph.getBody()), listManager, xhtml);
+        }
+
+        // Finish this paragraph
+        xhtml.endElement(tag);
+
+        if (headerFooterPolicy != null) {
+            extractFooters(xhtml, headerFooterPolicy, listManager);
+        }
+    }
+
+    private void writeParagraphNumber(XWPFParagraph paragraph,
+                                      XWPFListManager listManager,
+                                      XHTMLContentHandler xhtml) throws 
SAXException {
+        if (paragraph.getNumIlvl() == null) {
+            return;
+        }
+        String number = listManager.getFormattedNumber(paragraph);
+        if (number != null) {
+            xhtml.characters(number);
+        }
+
+    }
+
+    private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
+                                         TmpFormatting fmtg) throws 
SAXException {
+        // Close any still open style tags
+        if (fmtg.isItalic()) {
+            xhtml.endElement("i");
+            fmtg.setItalic(false);
+        }
+        if (fmtg.isBold()) {
+            xhtml.endElement("b");
+            fmtg.setBold(false);
+        }
+        return fmtg;
+    }
+
+    /**
+     * Writes one run: opens or closes bold/italic tags so they match the run,
+     * wraps the run in a link when it is a hyperlink run with a target, writes
+     * the run text and an {@code <img>} for each embedded picture with data.
+     *
+     * @param run       run to write
+     * @param paragraph paragraph owning the run
+     * @param xhtml     handler receiving the output
+     * @param tfmtg     formatting state left open by the previous run; updated in place
+     * @return the same {@code tfmtg} instance
+     */
+    private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
+                                     XHTMLContentHandler xhtml, TmpFormatting tfmtg)
+            throws SAXException, XmlException, IOException {
+        // Bold state changes: italic is nested inside bold, so an open
+        // italic tag has to be closed before the bold tag can be touched.
+        if (tfmtg.isBold() != run.isBold()) {
+            if (tfmtg.isItalic()) {
+                xhtml.endElement("i");
+                tfmtg.setItalic(false);
+            }
+            if (!run.isBold()) {
+                xhtml.endElement("b");
+            } else {
+                xhtml.startElement("b");
+            }
+            tfmtg.setBold(run.isBold());
+        }
+
+        // Italic state changes
+        if (tfmtg.isItalic() != run.isItalic()) {
+            if (!run.isItalic()) {
+                xhtml.endElement("i");
+            } else {
+                xhtml.startElement("i");
+            }
+            tfmtg.setItalic(run.isItalic());
+        }
+
+        // Work out the link target, if any: a resolved URL wins over an anchor
+        boolean addedHREF = false;
+        if (run instanceof XWPFHyperlinkRun) {
+            XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run;
+            XWPFHyperlink link = linkRun.getHyperlink(document);
+            String href = null;
+            if (link != null && link.getURL() != null) {
+                href = link.getURL();
+            } else if (linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
+                href = "#" + linkRun.getAnchor();
+            }
+            if (href != null) {
+                xhtml.startElement("a", "href", href);
+                addedHREF = true;
+            }
+        }
+
+        xhtml.characters(run.toString());
+
+        // If we have any pictures, output them
+        for (XWPFPicture picture : run.getEmbeddedPictures()) {
+            if (paragraph.getDocument() == null) {
+                continue;
+            }
+            XWPFPictureData data = picture.getPictureData();
+            if (data == null) {
+                continue;
+            }
+            AttributesImpl attr = new AttributesImpl();
+            attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName());
+            attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription());
+
+            xhtml.startElement("img", attr);
+            xhtml.endElement("img");
+        }
+
+        if (addedHREF) {
+            xhtml.endElement("a");
+        }
+
+        return tfmtg;
+    }
+
+    /**
+     * Writes the text content of an inline structured document tag (SDT) as
+     * plain characters; no element is opened or closed for it here.
+     */
+    private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        xhtml.characters(run.getContent().getText());
+    }
+
+    private void extractTable(XWPFTable table, XWPFListManager listManager,
+                              XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        xhtml.startElement("table");
+        xhtml.startElement("tbody");
+        for (XWPFTableRow row : table.getRows()) {
+            xhtml.startElement("tr");
+            for (ICell cell : row.getTableICells()) {
+                xhtml.startElement("td");
+                if (cell instanceof XWPFTableCell) {
+                    extractIBodyText((XWPFTableCell) cell, listManager, xhtml);
+                } else if (cell instanceof XWPFSDTCell) {
+                    xhtml.characters(((XWPFSDTCell) 
cell).getContent().getText());
+                }
+                xhtml.endElement("td");
+            }
+            xhtml.endElement("tr");
+        }
+        xhtml.endElement("tbody");
+        xhtml.endElement("table");
+    }
+
+    private void extractFooters(
+            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
+            XWPFListManager listManager)
+            throws SAXException, XmlException, IOException {
+        // footers
+        if (hfPolicy.getFirstPageFooter() != null) {
+            extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(), 
listManager);
+        }
+        if (hfPolicy.getEvenPageFooter() != null) {
+            extractHeaderText(xhtml, hfPolicy.getEvenPageFooter(), 
listManager);
+        }
+        if (hfPolicy.getDefaultFooter() != null) {
+            extractHeaderText(xhtml, hfPolicy.getDefaultFooter(), listManager);
+        }
+    }
+
+    private void extractHeaders(
+            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, 
XWPFListManager listManager)
+            throws SAXException, XmlException, IOException {
+        if (hfPolicy == null) return;
+
+        if (hfPolicy.getFirstPageHeader() != null) {
+            extractHeaderText(xhtml, hfPolicy.getFirstPageHeader(), 
listManager);
+        }
+
+        if (hfPolicy.getEvenPageHeader() != null) {
+            extractHeaderText(xhtml, hfPolicy.getEvenPageHeader(), 
listManager);
+        }
+
+        if (hfPolicy.getDefaultHeader() != null) {
+            extractHeaderText(xhtml, hfPolicy.getDefaultHeader(), listManager);
+        }
+    }
+
+    private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter 
header, XWPFListManager listManager) throws SAXException, XmlException, 
IOException {
+
+        for (IBodyElement e : header.getBodyElements()) {
+            if (e instanceof XWPFParagraph) {
+                extractParagraph((XWPFParagraph) e, listManager, xhtml);
+            } else if (e instanceof XWPFTable) {
+                extractTable((XWPFTable) e, listManager, xhtml);
+            } else if (e instanceof XWPFSDT) {
+                extractSDT((XWPFSDT) e, xhtml);
+            }
+        }
+    }
+
+    /**
+     * Word documents are simple, they only have the one
+     * main part
+     */
+    @Override
+    protected List<PackagePart> getMainDocumentParts() {
+        List<PackagePart> parts = new ArrayList<PackagePart>();
+        parts.add(document.getPackagePart());
+        return parts;
+    }
+
+    /**
+     * Mutable record of which inline style tags (bold, italic) are currently
+     * open while the runs of a paragraph are being written.
+     * <p>
+     * Declared static because it never touches the enclosing extractor, so
+     * it carries no hidden reference to it.
+     */
+    private static final class TmpFormatting {
+        private boolean bold;
+        private boolean italic;
+
+        /**
+         * @param bold   whether a bold tag is open
+         * @param italic whether an italic tag is open
+         */
+        private TmpFormatting(boolean bold, boolean italic) {
+            this.bold = bold;
+            this.italic = italic;
+        }
+
+        /** @return true while a bold tag is open */
+        public boolean isBold() {
+            return bold;
+        }
+
+        /** @param bold whether a bold tag is now open */
+        public void setBold(boolean bold) {
+            this.bold = bold;
+        }
+
+        /** @return true while an italic tag is open */
+        public boolean isItalic() {
+            return italic;
+        }
+
+        /** @param italic whether an italic tag is now open */
+        public void setItalic(boolean italic) {
+            this.italic = italic;
+        }
+
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
+ * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+    /** Prefix shared by the old OpenOffice 1.0 namespace URIs. */
+    private static final String OLD_NS =
+            "http://openoffice.org/2000/";
+
+    /** Prefix of the OpenDocument namespace URNs the old URIs are mapped to. */
+    private static final String NEW_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:";
+
+    /** Public id of the OpenOffice DTD, which is never actually loaded. */
+    private static final String DTD_PUBLIC_ID =
+            "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+
+    /**
+     * @param handler handler that receives the normalized events
+     */
+    public NSNormalizerContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Maps an old OpenOffice namespace URI to its OpenDocument equivalent:
+     * the part after {@link #OLD_NS} is appended to {@link #NEW_NS} and
+     * suffixed with ":1.0".
+     *
+     * @param ns namespace URI; may be null
+     * @return the mapped URI, or {@code ns} itself when it is null or not an
+     *         old-style URI
+     */
+    private String mapOldNS(String ns) {
+        if (ns != null && ns.startsWith(OLD_NS)) {
+            return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
+        } else {
+            return ns;
+        }
+    }
+
+    /**
+     * Forwards the element with its own namespace and the namespaces of all
+     * its attributes normalized.
+     */
+    @Override
+    public void startElement(
+            String namespaceURI, String localName, String qName,
+            Attributes atts) throws SAXException {
+        AttributesImpl natts = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            natts.addAttribute(
+                    mapOldNS(atts.getURI(i)), atts.getLocalName(i),
+                    atts.getQName(i), atts.getType(i), atts.getValue(i));
+        }
+        // Pass on the normalized copy; passing the original atts would
+        // leave the attribute namespaces unmapped.
+        super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
+    }
+
+    /** Forwards the end of an element with its namespace normalized. */
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName)
+            throws SAXException {
+        super.endElement(mapOldNS(namespaceURI), localName, qName);
+    }
+
+    /** Forwards the prefix mapping with its namespace URI normalized. */
+    @Override
+    public void startPrefixMapping(String prefix, String uri)
+            throws SAXException {
+        super.startPrefixMapping(prefix, mapOldNS(uri));
+    }
+
+    /**
+     * do not load any DTDs (may be requested by parser). Fake the DTD by
+     * returning a empty string as InputSource
+     *
+     * @return an empty source when the system id ends in ".dtd" (case
+     *         insensitively) or the public id is the OpenOffice DTD's;
+     *         otherwise whatever the superclass resolves
+     */
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId)
+            throws IOException, SAXException {
+        if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
+                || DTD_PUBLIC_ID.equals(publicId)) {
+            return new InputSource(new StringReader(""));
+        } else {
+            return super.resolveEntity(publicId, systemId);
+        }
+    }
+
+}

Added: 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
URL: 
http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java?rev=1723223&view=auto
==============================================================================
--- 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
 (added)
+++ 
tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
 Wed Jan  6 03:50:50 2016
@@ -0,0 +1,515 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import javax.xml.XMLConstants;
+import javax.xml.namespace.QName;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+/**
+ * Parser for ODF <code>content.xml</code> files.
+ */
+public class OpenDocumentContentParser extends AbstractParser {
+    private interface Style { // common supertype of TextStyle and ListStyle
+    }
+
+    private static class TextStyle implements Style { // flags of a text style
+        public boolean italic; // spans in this style are wrapped in an i element
+        public boolean bold; // spans in this style are wrapped in a b element
+        public boolean underlined; // spans in this style are wrapped in a u element
+    }
+
+    /**
+     * List style collected from a {@code text:list-style} declaration.
+     */
+    private static class ListStyle implements Style {
+        /** Set when the style declares numbered rather than bulleted levels. */
+        public boolean ordered;
+
+        /**
+         * @return the XHTML list element for this style: "ol" when
+         *         {@link #ordered} is set, "ul" otherwise
+         */
+        public String getTag() {
+            if (ordered) {
+                return "ol";
+            }
+            return "ul";
+        }
+    }
+
+    private static final class OpenDocumentElementMappingContentHandler extends
+            ElementMappingContentHandler {
+        /** Target handler; headings, lists and style tags go to it directly. */
+        private final ContentHandler handler;
+        /** One bit per open element: set if that element is a text node. */
+        private final BitSet textNodeStack = new BitSet();
+        /** Number of currently open elements; marks the top of textNodeStack. */
+        private int nodeDepth = 0;
+        /** Number of open elements whose entire content is filtered out. */
+        private int completelyFiltered = 0;
+        /** XHTML tag names of the headings that are currently open. */
+        private final Stack<String> headingStack = new Stack<String>();
+        /** Text styles declared in the document, keyed by style name. */
+        private final Map<String, TextStyle> textStyleMap =
+                new HashMap<String, TextStyle>();
+        /** List styles declared in the document, keyed by style name. */
+        private final Map<String, ListStyle> listStyleMap =
+                new HashMap<String, ListStyle>();
+        /** Text style being declared, or the style of the span being read. */
+        private TextStyle textStyle;
+        /** Style of the last closed span whose end tags are still pending. */
+        private TextStyle lastTextStyle;
+        /** List styles pushed by startList() and popped by endList(). */
+        private final Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
+        /** List style being declared, or null outside such a declaration. */
+        private ListStyle listStyle;
+
+        /**
+         * Creates a mapping handler that writes to the given target.
+         *
+         * @param handler  receives the generated XHTML events
+         * @param mappings element mappings handed on to the superclass
+         */
+        private OpenDocumentElementMappingContentHandler(
+                ContentHandler handler, Map<QName, TargetElement> mappings) {
+            super(handler, mappings);
+            // kept as well: headings, lists and style tags are written to the
+            // target directly instead of going through the mappings
+            this.handler = handler;
+        }
+
+        /**
+         * Forwards character data only when the innermost open element is a
+         * text node and no completely filtered element is open. Pending span
+         * end tags are written before the text.
+         */
+        @Override
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            if (completelyFiltered != 0 || nodeDepth <= 0) {
+                return;
+            }
+            // the bit just below the top of the stack describes the element
+            // the characters belong to
+            final boolean insideTextNode = textNodeStack.get(nodeDepth - 1);
+            if (!insideTextNode) {
+                return;
+            }
+            lazyEndSpan();
+            super.characters(ch, start, length);
+        }
+
+        /**
+         * Tells whether an element has to be dropped together with all of its
+         * sub-elements: text elements whose name ends in "-template" or
+         * "-style", and covered table cells.
+         */
+        private boolean needsCompleteFiltering(
+                String namespaceURI, String localName) {
+            if (!TEXT_NS.equals(namespaceURI)) {
+                return TABLE_NS.equals(namespaceURI)
+                        && "covered-table-cell".equals(localName);
+            }
+            return localName.endsWith("-template")
+                    || localName.endsWith("-style");
+        }
+
+        /**
+         * Maps the {@code text:outline-level} attribute of a heading to the
+         * name of an XHTML heading element.
+         *
+         * @param atts attributes of the heading element
+         * @return "h1" to "h6"; levels outside that range are clamped, and a
+         *         missing or non-numeric level yields "h1"
+         */
+        private String getXHTMLHeaderTagName(Attributes atts) {
+            String depthStr = atts.getValue(TEXT_NS, "outline-level");
+            if (depthStr == null) {
+                return "h1";
+            }
+
+            int depth;
+            try {
+                depth = Integer.parseInt(depthStr.trim());
+            } catch (NumberFormatException e) {
+                // a malformed level must not abort the whole parse
+                return "h1";
+            }
+            // clamp to the heading levels XHTML offers
+            return "h" + Math.max(1, Math.min(6, depth));
+        }
+
+        /**
+         * Decides whether the character content of an element is forwarded:
+         * true for every element of the text namespace except "page-number"
+         * and "page-count", and for the SVG "title" and "desc" elements.
+         */
+        private boolean isTextNode(String namespaceURI, String localName) {
+            if (TEXT_NS.equals(namespaceURI)) {
+                // the two page fields are not treated as text nodes
+                boolean pageField = localName.equals("page-number")
+                        || localName.equals("page-count");
+                if (!pageField) {
+                    return true;
+                }
+            }
+            return SVG_NS.equals(namespaceURI)
+                    && ("title".equals(localName) || "desc".equals(localName));
+        }
+
+        /**
+         * Opens the XHTML list element for a {@code text:list}.
+         *
+         * @param name value of the list's style-name attribute, may be null
+         * @throws SAXException if the target handler rejects the event
+         */
+        private void startList(String name) throws SAXException {
+            // An entry is pushed for every list, null standing for "no known
+            // style". Otherwise the end of a nested list without a style name
+            // would pop the style of its parent and close the wrong element.
+            ListStyle style = name != null ? listStyleMap.get(name) : null;
+            listStyleStack.push(style);
+
+            String elementName = style != null ? style.getTag() : "ul";
+            handler.startElement(
+                    XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+        }
+
+        private void endList() throws SAXException {
+            String elementName = "ul";
+            if (!listStyleStack.isEmpty()) {
+                ListStyle style = listStyleStack.pop();
+                elementName = style != null ? style.getTag() : "ul";
+            }
+            handler.endElement(XHTML, elementName, elementName);
+        }
+
+        private void startSpan(String name) throws SAXException {
+            if (name == null) {
+                return;
+            }
+
+            TextStyle style = textStyleMap.get(name);
+            if (style == null) {
+                return;
+            }
+
+            // End tags that refer to no longer valid styles
+            if (!style.underlined && lastTextStyle != null && 
lastTextStyle.underlined) {
+                handler.endElement(XHTML, "u", "u");
+            }
+            if (!style.italic && lastTextStyle != null && 
lastTextStyle.italic) {
+                handler.endElement(XHTML, "i", "i");
+            }
+            if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
+                handler.endElement(XHTML, "b", "b");
+            }
+
+            // Start tags for new styles
+            if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
+                handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+            }
+            if (style.italic && (lastTextStyle == null || 
!lastTextStyle.italic)) {
+                handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+            }
+            if (style.underlined && (lastTextStyle == null || 
!lastTextStyle.underlined)) {
+                handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+            }
+
+            textStyle = style;
+            lastTextStyle = null;
+        }
+
+        private void endSpan() throws SAXException { // writes no end tags itself
+            lastTextStyle = textStyle; // read later by startSpan()/lazyEndSpan()
+            textStyle = null;
+        }
+
+        private void lazyEndSpan() throws SAXException {
+            if (lastTextStyle == null) {
+                return;
+            }
+
+            if (lastTextStyle.underlined) {
+                handler.endElement(XHTML, "u", "u");
+            }
+            if (lastTextStyle.italic) {
+                handler.endElement(XHTML, "i", "i");
+            }
+            if (lastTextStyle.bold) {
+                handler.endElement(XHTML, "b", "b");
+            }
+
+            lastTextStyle = null;
+        }
+
+        /**
+         * Handles the start of an ODF element: records style declarations,
+         * tracks whether the element is a text node, maintains the filter
+         * counter and, unless filtered, emits the matching XHTML start event.
+         * Headings, lists and spans are written to the target handler directly;
+         * everything else goes through the element mappings of the superclass.
+         */
+        @Override
+        public void startElement(
+                String namespaceURI, String localName, String qName,
+                Attributes attrs) throws SAXException {
+            // Keep track of the current node type: if the element is a text
+            // node, the bit at the current depth is set in textNodeStack.
+            // characters() checks the top bit to decide whether the text is
+            // forwarded. nodeDepth holds the depth of the current node and
+            // also marks the top of that stack.
+            assert nodeDepth >= 0;
+
+            // Set styles
+            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+                String family = attrs.getValue(STYLE_NS, "family");
+                if ("text".equals(family)) {
+                    textStyle = new TextStyle();
+                    String name = attrs.getValue(STYLE_NS, "name");
+                    textStyleMap.put(name, textStyle);
+                }
+            } else if (TEXT_NS.equals(namespaceURI)
+                    && "list-style".equals(localName)) {
+                listStyle = new ListStyle();
+                String name = attrs.getValue(STYLE_NS, "name");
+                listStyleMap.put(name, listStyle);
+            } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
+                    && "text-properties".equals(localName)) {
+                String fontStyle =
+                        attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+                if ("italic".equals(fontStyle)
+                        || "oblique".equals(fontStyle)) {
+                    textStyle.italic = true;
+                }
+                String fontWeight =
+                        attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+                if (isBoldFontWeight(fontWeight)) {
+                    textStyle.bold = true;
+                }
+                String underlineStyle =
+                        attrs.getValue(STYLE_NS, "text-underline-style");
+                // "none" explicitly switches underlining off
+                if (underlineStyle != null
+                        && !"none".equals(underlineStyle)) {
+                    textStyle.underlined = true;
+                }
+            } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+                if ("list-level-style-bullet".equals(localName)) {
+                    listStyle.ordered = false;
+                } else if ("list-level-style-number".equals(localName)) {
+                    listStyle.ordered = true;
+                }
+            }
+
+            textNodeStack.set(nodeDepth++,
+                    isTextNode(namespaceURI, localName));
+            // filter *all* content of some tags
+            assert completelyFiltered >= 0;
+
+            if (needsCompleteFiltering(namespaceURI, localName)) {
+                completelyFiltered++;
+            }
+            // call next handler if no filtering
+            if (completelyFiltered == 0) {
+                // special handling of text:h, that are directly passed
+                // to incoming handler
+                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+                    final String el =
+                            headingStack.push(getXHTMLHeaderTagName(attrs));
+                    handler.startElement(
+                            XHTMLContentHandler.XHTML, el, el,
+                            EMPTY_ATTRIBUTES);
+                } else if (TEXT_NS.equals(namespaceURI)
+                        && "list".equals(localName)) {
+                    startList(attrs.getValue(TEXT_NS, "style-name"));
+                } else if (TEXT_NS.equals(namespaceURI)
+                        && "span".equals(localName)) {
+                    startSpan(attrs.getValue(TEXT_NS, "style-name"));
+                } else {
+                    super.startElement(namespaceURI, localName, qName, attrs);
+                }
+            }
+        }
+
+        /**
+         * Tells whether a font-weight attribute value denotes bold text:
+         * "bold", "bolder", or a number greater than 500. A missing, empty
+         * or unparsable value is not bold.
+         */
+        private static boolean isBoldFontWeight(String fontWeight) {
+            if (fontWeight == null || fontWeight.isEmpty()) {
+                return false;
+            }
+            if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) {
+                return true;
+            }
+            if (!Character.isDigit(fontWeight.charAt(0))) {
+                return false;
+            }
+            try {
+                return Integer.parseInt(fontWeight) > 500;
+            } catch (NumberFormatException e) {
+                // e.g. "5x": a broken style must not abort the whole parse
+                return false;
+            }
+        }
+
+        /**
+         * Handles the end of an ODF element: finishes style declarations,
+         * emits the matching XHTML end event unless the element is filtered,
+         * writes a tab character after tab elements, and finally unwinds the
+         * filter counter and the node depth.
+         */
+        @Override
+        public void endElement(
+                String namespaceURI, String localName, String qName)
+                throws SAXException {
+            final boolean inTextNs = TEXT_NS.equals(namespaceURI);
+
+            // a finished declaration is no longer the one being filled in
+            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+                textStyle = null;
+            } else if (inTextNs && "list-style".equals(localName)) {
+                listStyle = null;
+            }
+
+            // forward to the next handler unless inside a filtered subtree
+            if (completelyFiltered == 0) {
+                if (inTextNs && "h".equals(localName)) {
+                    // headings bypass the mappings and go to the target
+                    final String headingTag = headingStack.pop();
+                    handler.endElement(XHTML, headingTag, headingTag);
+                } else if (inTextNs && "list".equals(localName)) {
+                    endList();
+                } else if (inTextNs && "span".equals(localName)) {
+                    endSpan();
+                } else {
+                    if (inTextNs && "p".equals(localName)) {
+                        // close span tags before the paragraph itself ends
+                        lazyEndSpan();
+                    }
+                    super.endElement(namespaceURI, localName, qName);
+                }
+
+                // tabulators are rendered as a tab character
+                final boolean tabElement = inTextNs
+                        && ("tab-stop".equals(localName)
+                        || "tab".equals(localName));
+                if (tabElement) {
+                    this.characters(TAB, 0, TAB.length);
+                }
+            }
+
+            // leave the filtered subtree again
+            if (needsCompleteFiltering(namespaceURI, localName)) {
+                completelyFiltered--;
+            }
+            assert completelyFiltered >= 0;
+
+            // one level up
+            nodeDepth--;
+            assert nodeDepth >= 0;
+        }
+
+        @Override
+        public void startPrefixMapping(String prefix, String uri) {
+            // dropped on purpose: prefix mappings should not occur in XHTML
+        }
+
+        @Override
+        public void endPrefixMapping(String prefix) {
+            // dropped on purpose: prefix mappings should not occur in XHTML
+        }
+    }
+
+    /** Namespace URI of the ODF text vocabulary. */
+    public static final String TEXT_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+    /** Namespace URI of the ODF table vocabulary. */
+    public static final String TABLE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+
+    /** Namespace URI of the ODF style vocabulary. */
+    public static final String STYLE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+    /** Namespace URI of the XSL-FO compatible formatting attributes. */
+    public static final String FORMATTING_OBJECTS_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
+    /** Namespace URI of the ODF office vocabulary. */
+    public static final String OFFICE_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
+
+    /** Namespace URI of the SVG compatible vocabulary. */
+    public static final String SVG_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
+
+    /** Namespace URI of the ODF presentation vocabulary. */
+    public static final String PRESENTATION_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
+
+    /** Namespace URI of the ODF drawing vocabulary. */
+    public static final String DRAW_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+
+    /** Namespace URI of XLink. */
+    public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
+
+    /** The single tab character written for ODF tab elements. */
+    protected static final char[] TAB = new char[]{'\t'};
+
+    /** Empty attribute list used for the XHTML elements written directly. */
+    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+    /**
+     * Translation table from ODF element names to the XHTML elements (and
+     * attributes) they are written as. Elements and attributes without an
+     * entry are ignored and left out of the event stream.
+     */
+    private static final HashMap<QName, TargetElement> MAPPINGS =
+            new HashMap<QName, TargetElement>();
+
+    static {
+        // attributes carried over from text:a to the XHTML anchor
+        final HashMap<QName, QName> linkAttributes =
+                new HashMap<QName, QName>();
+        linkAttributes.put(new QName(XLINK_NS, "href"), new QName("href"));
+        linkAttributes.put(new QName(XLINK_NS, "title"), new QName("title"));
+
+        // rowspan/colspan attributes carried over to the XHTML table cell
+        final HashMap<QName, QName> cellAttributes =
+                new HashMap<QName, QName>();
+        cellAttributes.put(
+                new QName(TABLE_NS, "number-columns-spanned"),
+                new QName("colspan"));
+        cellAttributes.put(
+                new QName(TABLE_NS, "number-rows-spanned"),
+                new QName("rowspan"));
+        /* TODO: The following is not correct; the cell should be repeated,
+         * not spanned! The code generates one HTML cell spanning all repeated
+         * columns to make the cell look correct. Problems may occur when both
+         * spanning and repeating are given, which the spec does not allow.
+         * Spanning instead of repeating is not a problem in practice, because
+         * OpenOffice uses repetition only for empty cells.
+         */
+        cellAttributes.put(
+                new QName(TABLE_NS, "number-columns-repeated"),
+                new QName("colspan"));
+
+        // general mappings of text: elements; text:h is handled separately
+        // in startElement/endElement
+        MAPPINGS.put(new QName(TEXT_NS, "p"),
+                new TargetElement(XHTML, "p"));
+        MAPPINGS.put(new QName(TEXT_NS, "line-break"),
+                new TargetElement(XHTML, "br"));
+        MAPPINGS.put(new QName(TEXT_NS, "list-item"),
+                new TargetElement(XHTML, "li"));
+        MAPPINGS.put(new QName(TEXT_NS, "note"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(new QName(OFFICE_NS, "annotation"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(new QName(PRESENTATION_NS, "notes"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(new QName(DRAW_NS, "object"),
+                new TargetElement(XHTML, "object"));
+        MAPPINGS.put(new QName(DRAW_NS, "text-box"),
+                new TargetElement(XHTML, "div"));
+        MAPPINGS.put(new QName(SVG_NS, "title"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(new QName(SVG_NS, "desc"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(new QName(TEXT_NS, "span"),
+                new TargetElement(XHTML, "span"));
+        MAPPINGS.put(new QName(TEXT_NS, "a"),
+                new TargetElement(XHTML, "a", linkAttributes));
+
+        // HTML tables from table: elements; repeating of rows is ignored
+        MAPPINGS.put(new QName(TABLE_NS, "table"),
+                new TargetElement(XHTML, "table"));
+        MAPPINGS.put(new QName(TABLE_NS, "table-row"),
+                new TargetElement(XHTML, "tr"));
+        MAPPINGS.put(new QName(TABLE_NS, "table-cell"),
+                new TargetElement(XHTML, "td", cellAttributes));
+    }
+
+    /**
+     * Announces no media types, because this parser is not a top-level
+     * parser.
+     *
+     * @param context parse context (not consulted)
+     * @return always the empty set
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.<MediaType>emptySet();
+    }
+
+    /**
+     * Parses a <code>content.xml</code> stream, wrapping the given handler in
+     * an {@link XHTMLContentHandler} before delegating to
+     * {@code parseInternal}.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        final XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+        parseInternal(stream, xhtml, metadata, context);
+    }
+
+    void parseInternal(
+            InputStream stream, final ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        DefaultHandler dh = new 
OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
+
+        try {
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            factory.setValidating(false);
+            factory.setNamespaceAware(true);
+            try {
+                factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, 
true);
+            } catch (SAXNotRecognizedException e) {
+                // TIKA-329: Some XML parsers do not support the 
secure-processing
+                // feature, even though it's required by JAXP in Java 5. 
Ignoring
+                // the exception is fine here, deployments without this feature
+                // are inherently vulnerable to XML denial-of-service attacks.
+            }
+            SAXParser parser = factory.newSAXParser();
+            parser.parse(
+                    new CloseShieldInputStream(stream),
+                    new OfflineContentHandler(
+                            new NSNormalizerContentHandler(dh)));
+        } catch (ParserConfigurationException e) {
+            throw new TikaException("XML parser configuration error", e);
+        }
+    }
+
+}

svn commit: r1723223 [12/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Reply via email to