Repository: tika Updated Branches: refs/heads/2.x 2f5537380 -> 1ce93ed9e
TIKA-2019 -- fix WordMLParser and SpreadsheetMLParser Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/1ce93ed9 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/1ce93ed9 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/1ce93ed9 Branch: refs/heads/2.x Commit: 1ce93ed9ece3b93ff28e532a76ce3b326d734593 Parents: 2f55373 Author: tballison <[email protected]> Authored: Fri Jun 24 10:21:30 2016 -0400 Committer: tballison <[email protected]> Committed: Fri Jun 24 10:21:30 2016 -0400 ---------------------------------------------------------------------- .../microsoft/xml/AbstractXML2003Parser.java | 91 +++++++++++++++----- .../microsoft/xml/SpreadsheetMLParser.java | 42 ++++++--- .../tika/parser/microsoft/xml/WordMLParser.java | 68 +++++++++++---- .../parser/microsoft/xml/XML2003ParserTest.java | 32 ++++++- 4 files changed, 179 insertions(+), 54 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/1ce93ed9/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java index 8fd0648..a12f25e 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java @@ -16,38 +16,57 @@ */ package org.apache.tika.parser.microsoft.xml; -import org.apache.tika.metadata.*; +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; +import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.xml.ElementMetadataHandler; -import org.apache.tika.parser.xml.XMLParser; +import org.apache.tika.sax.EmbeddedContentHandler; +import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.TaggedContentHandler; import org.apache.tika.sax.TeeContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; +import java.io.IOException; +import java.io.InputStream; -public abstract class AbstractXML2003Parser extends XMLParser { +public abstract class AbstractXML2003Parser extends AbstractParser { - protected final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office"; - protected final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office"; - protected final static String MS_SPREADSHEET_URN = "urn:schemas-microsoft-com:office:spreadsheet"; - protected final static String WORD_ML_URL = "http://schemas.microsoft.com/office/word/2003/wordml"; - protected final static Attributes EMPTY_ATTRS = new AttributesImpl(); + final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office"; + final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office"; + final static String MS_SPREADSHEET_URN = "urn:schemas-microsoft-com:office:spreadsheet"; + final static String WORD_ML_URL = "http://schemas.microsoft.com/office/word/2003/wordml"; + final static Attributes EMPTY_ATTRS = new AttributesImpl(); - protected final static String DOCUMENT_PROPERTIES = "DocumentProperties"; - protected final static String PICT = "pict"; - protected final static String BIN_DATA = "binData"; + final static String DOCUMENT_PROPERTIES = "DocumentProperties"; + final static String PICT = "pict"; + final static String BIN_DATA = "binData"; - protected final static String A = "a"; - protected final static String IMG = "img"; - protected final static String HREF = "href"; - protected final static String CDATA = "cdata"; - protected final static String TABLE = "table"; - protected final static String TBODY = "tbody"; + final static String A = "a"; + final static String BODY = "body"; + final static String CDATA = "cdata"; + final static String DIV = "div"; + final static String HREF = "href"; + final static String IMG = "img"; + final static String P = "p"; + final static String TD = "td"; + final static String TR = "tr"; + final static String TABLE = "table"; + final static String TBODY = "tbody"; - protected final static String HLINK = "hlink"; - protected final static String HLINK_DEST = "dest"; - protected final static String NAME_ATTR = "name"; + final static String HLINK = "hlink"; + final static String HLINK_DEST = "dest"; + final static String NAME_ATTR = "name"; private static ContentHandler getMSPropertiesHandler( @@ -58,8 +77,34 @@ public abstract class AbstractXML2003Parser extends XMLParser { } @Override + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + setContentType(metadata); + + final XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + TaggedContentHandler tagged = new TaggedContentHandler(xhtml); + try { + context.getSAXParser().parse( + new CloseShieldInputStream(stream), + new OfflineContentHandler(new EmbeddedContentHandler( + getContentHandler(tagged, metadata, context)))); + } catch (SAXException e) { + tagged.throwIfCauseOf(e); + throw new TikaException("XML parse error", e); + } finally { + xhtml.endDocument(); + } + } + protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) { - ch = new TeeContentHandler( + //ContentHandler is not currently used, but leave that as an option for + //potential future additions + return new TeeContentHandler( getMSPropertiesHandler(md, TikaCoreProperties.TITLE, "Title"), getMSPropertiesHandler(md, TikaCoreProperties.CREATOR, "Author"), getMSPropertiesHandler(md, Office.LAST_AUTHOR, "LastAuthor"), @@ -75,7 +120,7 @@ public abstract class AbstractXML2003Parser extends XMLParser { getMSPropertiesHandler(md, Office.LINE_COUNT, "Lines"), getMSPropertiesHandler(md, Office.PARAGRAPH_COUNT, "Paragraphs"), getMSPropertiesHandler(md, OfficeOpenXMLCore.VERSION, "Version")); - - return ch; } + abstract protected void setContentType(Metadata contentType); + } http://git-wip-us.apache.org/repos/asf/tika/blob/1ce93ed9/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java index b3c42b3..e703f0e 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java @@ -31,6 +31,7 @@ import org.xml.sax.helpers.DefaultHandler; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; +import java.util.Locale; import java.util.Set; /** @@ -41,14 +42,15 @@ import java.util.Set; */ public class SpreadsheetMLParser extends AbstractXML2003Parser { - final static String CELL = "Cell"; - final static String DATA = "Data"; - final static String ROW = "Row"; - final static String WORKSHEET = "Worksheet"; + final static String CELL = "cell"; + final static String DATA = "data"; + final static String ROW = "row"; + final static String WORKSHEET = "worksheet"; - protected static final Set<MediaType> SUPPORTED_TYPES = + private static final MediaType MEDIA_TYPE = MediaType.application("vnd.ms-spreadsheetml"); + private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( - MediaType.application("vnd.ms-spreadsheetml")))); + MEDIA_TYPE))); @Override public Set<MediaType> getSupportedTypes(ParseContext context) { @@ -64,11 +66,17 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser { new ExcelMLHandler(ch)); } + @Override + public void setContentType(Metadata metadata) { + metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString()); + } + private class ExcelMLHandler extends DefaultHandler { final ContentHandler handler; StringBuilder buffer = new StringBuilder(); String href = null; boolean inData = false; + boolean inBody = false; public ExcelMLHandler(ContentHandler handler) { this.handler = handler; @@ -77,9 +85,12 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser { @Override public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException { + localName = localName.toLowerCase(Locale.US); if (MS_SPREADSHEET_URN.equals(uri)) { - if ("Table".equals(localName)) { + if (BODY.equals(localName)) { + inBody = true; + } else if (TABLE.equals(localName)) { handler.startElement(XHTMLContentHandler.XHTML, TABLE, TABLE, EMPTY_ATTRS); handler.startElement(XHTMLContentHandler.XHTML, TBODY, TBODY, EMPTY_ATTRS); } else if (WORKSHEET.equals(localName)) { @@ -91,12 +102,12 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser { NAME_ATTR, CDATA, worksheetName); } - handler.startElement(XHTMLContentHandler.XHTML, "div", "div", xhtmlAttrs); + handler.startElement(XHTMLContentHandler.XHTML, DIV, DIV, xhtmlAttrs); } else if (ROW.equals(localName)) { - handler.startElement(XHTMLContentHandler.XHTML, "tr", "tr", EMPTY_ATTRS); + handler.startElement(XHTMLContentHandler.XHTML, TR, TR, EMPTY_ATTRS); } else if (CELL.equals(localName)) { href = attrs.getValue(MS_SPREADSHEET_URN, "HRef"); - handler.startElement(XHTMLContentHandler.XHTML, "td", "td", EMPTY_ATTRS); + handler.startElement(XHTMLContentHandler.XHTML, TD, TD, EMPTY_ATTRS); } else if (DATA.equals(localName)) { inData = true; } @@ -107,30 +118,33 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser { public void characters(char[] str, int offset, int len) throws SAXException { if (inData) { buffer.append(str, offset, len); + } else if (inBody) { + handler.characters(str, offset, len); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { + localName = localName.toLowerCase(Locale.US); if (MS_SPREADSHEET_URN.equals(uri)) { - if ("Table".equals(localName)) { + if (TABLE.equals(localName)) { handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY); handler.endElement(XHTMLContentHandler.XHTML, TABLE, TABLE); } else if (WORKSHEET.equals(localName)) { handler.endElement( XHTMLContentHandler.XHTML, - "div", "div" + DIV, DIV ); } else if (ROW.equals(localName)) { handler.endElement( XHTMLContentHandler.XHTML, - "tr", "tr" + TR, TR ); } else if (CELL.equals(localName)) { handler.endElement( XHTMLContentHandler.XHTML, - "td", "td" + TD, TD ); } else if (DATA.equals(localName)) { if (href != null) { http://git-wip-us.apache.org/repos/asf/tika/blob/1ce93ed9/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java index 79c2043..28b33e4 100644 --- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java +++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java @@ -16,6 +16,16 @@ */ package org.apache.tika.parser.microsoft.xml; +import javax.xml.namespace.QName; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + import org.apache.commons.codec.binary.Base64; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; @@ -31,11 +41,6 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.helpers.DefaultHandler; -import javax.xml.namespace.QName; -import java.io.IOException; -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; - /** * Parses wordml 2003 format word files. These are single xml files * that predate ooxml. @@ -53,14 +58,16 @@ public class WordMLParser extends AbstractXML2003Parser { private final static Set<QName> IGNORE_CHARACTERS = Collections.newSetFromMap(new ConcurrentHashMap<QName, Boolean>()); - protected static final Set<MediaType> SUPPORTED_TYPES = + private static final MediaType MEDIA_TYPE = MediaType.application("vnd.ms-wordml"); + private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( - MediaType.application("vnd.ms-wordml")))); + MEDIA_TYPE))); + static { - WORDML_TO_XHTML.put("p", "p"); + WORDML_TO_XHTML.put(P, P); WORDML_TO_XHTML.put("tbl", TABLE); - WORDML_TO_XHTML.put("tr", "tr"); - WORDML_TO_XHTML.put("tc", "td"); + WORDML_TO_XHTML.put(TR, TR); + WORDML_TO_XHTML.put("tc", TD);//not a typo -- table cell -> tc IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, HLINK)); IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, PICT)); @@ -83,7 +90,6 @@ public class WordMLParser extends AbstractXML2003Parser { ex = new ParsingEmbeddedDocumentExtractor(context); } - return new TeeContentHandler( super.getContentHandler(ch, metadata, context), new WordMLHandler(ch), @@ -92,9 +98,20 @@ public class WordMLParser extends AbstractXML2003Parser { new PictHandler(ch, ex)); } + @Override + public void setContentType(Metadata metadata) { + metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString()); + } + private class WordMLHandler extends DefaultHandler { private final ContentHandler handler; private boolean ignoreCharacters; + private boolean inBody = false; + + //use inP to keep track of whether the handler is + //in a paragraph or not. <p><p></p></p> was allowed + //in wordml. Use this boolean to prevent <p> within <p> + private boolean inP; public WordMLHandler(ContentHandler handler) { this.handler = handler; @@ -103,14 +120,27 @@ public class WordMLParser extends AbstractXML2003Parser { @Override public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException { + localName = localName.toLowerCase(Locale.US); if (WORD_ML_URL.equals(uri)) { + if (BODY.equals(localName)) { + inBody = true; + return; + } String html = WORDML_TO_XHTML.get(localName); if (html != null) { + if (P.equals(localName)) { + //close p if already in a p to prevent nested <p> + if (inP) { + handler.endElement(XHTMLContentHandler.XHTML, P, P); + } + inP = true; + } handler.startElement(XHTMLContentHandler.XHTML, html, html, EMPTY_ATTRS); if (html.equals(TABLE)) { handler.startElement(XHTMLContentHandler.XHTML, TBODY, TBODY, EMPTY_ATTRS); } } + } if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) { ignoreCharacters = true; @@ -119,21 +149,31 @@ public class WordMLParser extends AbstractXML2003Parser { @Override public void characters(char[] str , int offset, int len) throws SAXException { - if (!ignoreCharacters) { + if (!ignoreCharacters && inBody) { handler.characters(str, offset, len); } } - @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (WORD_ML_URL.equals(uri)) { + //for now, don't bother checking for end of body...if there's any text + //after the close of body, we should extract it + localName = localName.toLowerCase(Locale.US); String html = WORDML_TO_XHTML.get(localName); if (html != null) { if (html.equals(TABLE)) { handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY); } + if (P.equals(html) && !inP) { + //start p if not already in one to prevent non-matching <p> + handler.startElement(XHTMLContentHandler.XHTML, P, P, EMPTY_ATTRS); + } handler.endElement(XHTMLContentHandler.XHTML, html, html); + + if (P.equals(html)) { + inP = false; + } } } if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) { @@ -143,7 +183,6 @@ public class WordMLParser extends AbstractXML2003Parser { } } - private class PictHandler extends DefaultHandler { final StringBuilder buffer = new StringBuilder(); final ContentHandler handler; @@ -221,5 +260,4 @@ public class WordMLParser extends AbstractXML2003Parser { } } } - } http://git-wip-us.apache.org/repos/asf/tika/blob/1ce93ed9/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java index 2641a62..04530ce 100644 --- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java +++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java @@ -17,10 +17,12 @@ package org.apache.tika.parser.microsoft.xml; import org.apache.tika.TikaTest; + import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.RecursiveParserWrapper; import org.junit.Test; @@ -38,9 +40,14 @@ public class XML2003ParserTest extends TikaTest { Metadata m = list.get(0);//container doc String xml = m.get(RecursiveParserWrapper.TIKA_CONTENT); + xml = xml.replaceAll("\\s+", " "); + //make sure that metadata gets dumped to xml + assertContains("<meta name=\"meta:character-count-with-spaces\" content=\"256\"", xml); + //do not allow nested <p> elements + assertContains("<p /> <img href=\"02000003.jpg\" /><p /> <p><img href=\"02000004.jpg\" /></p>", xml); assertContains("<table><tbody>", xml); assertContains("</tbody></table>", xml); - assertContains("<td><p>R1 c1</p></td>", xml); + assertContains("<td><p>R1 c1</p> </td>", xml); assertContains("<a href=\"https://tika.apache.org/\">tika</a>", xml); assertContains("footnote", xml); assertContains("Mycomment", xml); @@ -61,6 +68,18 @@ public class XML2003ParserTest extends TikaTest { assertEquals("2016-04-27T17:49:00Z", m.get(TikaCoreProperties.CREATED)); assertEquals("application/vnd.ms-wordml", m.get(Metadata.CONTENT_TYPE)); + //make sure embedded docs were properly processed + //no image parsers in this package + //assertContains("moscow-birds", + // Arrays.asList(list.get(7).getValues(TikaCoreProperties.KEYWORDS))); + + //check that text is extracted with breaks between elements + String txt = getText(getResourceAsStream("/test-documents/testWORD2003.xml"), new AutoDetectParser()); + txt = txt.replaceAll("\\s+", " "); + assertNotContained("beforeR1", txt); + assertContains("R1 c1 R1 c2", txt); + assertNotContained("footnoteFigure", txt); + assertContains("footnote Figure", txt); assertEquals("image/jpeg", list.get(7).get(Metadata.CONTENT_TYPE)); } @@ -73,10 +92,19 @@ public class XML2003ParserTest extends TikaTest { assertEquals("application/vnd.ms-spreadsheetml", m.get(Metadata.CONTENT_TYPE)); String xml = r.xml; - assertContains("<tr><td>Col1</td><td>Col2</td>", xml); + xml = xml.replaceAll("\\s+", " "); + //confirm metadata was dumped to xml + assertContains("<meta name=\"cp:version\" content=\"16.00\" />", xml); + assertContains("<tr> <td>Col1</td> <td>Col2</td>", xml); assertContains("<td>2016-04-27T00:00:00.000</td>", xml); assertContains("<a href=\"https://tika.apache.org/\">tika_hyperlink</a>", xml); assertContains("<td>5.5</td>", xml); + + //check that text is extracted with breaks between elements + String txt = getText(getResourceAsStream("/test-documents/testEXCEL2003.xml"), new AutoDetectParser()); + txt = txt.replaceAll("\\s+", " "); + assertContains("Col1 Col2 Col3 Col4 string 1 1.10", txt); + } }
