Author: tpalsulich
Date: Mon Apr 13 16:23:49 2015
New Revision: 1673236
URL: http://svn.apache.org/r1673236
Log:
TIKA-1600. Reformat ODF Parser files and move OpenDocumentParserTest tests to
ODFParserTest.
Removed:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/OpenDocumentParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java?rev=1673236&r1=1673235&r2=1673236&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java Mon Apr 13 16:23:49 2015
@@ -36,13 +36,13 @@ import org.xml.sax.helpers.AttributesImp
public class NSNormalizerContentHandler extends ContentHandlerDecorator {
private static final String OLD_NS =
- "http://openoffice.org/2000/";
+ "http://openoffice.org/2000/";
private static final String NEW_NS =
- "urn:oasis:names:tc:opendocument:xmlns:";
+ "urn:oasis:names:tc:opendocument:xmlns:";
private static final String DTD_PUBLIC_ID =
- "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+ "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
public NSNormalizerContentHandler(ContentHandler handler) {
super(handler);
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java?rev=1673236&r1=1673235&r2=1673236&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java Mon Apr 13 16:23:49 2015
@@ -16,7 +16,11 @@
*/
package org.apache.tika.parser.odf;
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+import javax.xml.XMLConstants;
+import javax.xml.namespace.QName;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
import java.io.IOException;
import java.io.InputStream;
@@ -27,12 +31,6 @@ import java.util.Map;
import java.util.Set;
import java.util.Stack;
-import javax.xml.XMLConstants;
-import javax.xml.namespace.QName;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
@@ -50,6 +48,8 @@ import org.xml.sax.SAXNotRecognizedExcep
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
/**
* Parser for ODF <code>content.xml</code> files.
*/
@@ -62,115 +62,115 @@ public class OpenDocumentContentParser e
public boolean bold;
public boolean underlined;
}
-
+
private static class ListStyle implements Style {
public boolean ordered;
-
+
public String getTag() {
return ordered ? "ol" : "ul";
}
}
private static final class OpenDocumentElementMappingContentHandler extends
- ElementMappingContentHandler {
- private final ContentHandler handler;
- private final BitSet textNodeStack = new BitSet();
- private int nodeDepth = 0;
- private int completelyFiltered = 0;
- private Stack<String> headingStack = new Stack<String>();
- private Map<String, TextStyle> textStyleMap = new
HashMap<String, TextStyle>();
+ ElementMappingContentHandler {
+ private final ContentHandler handler;
+ private final BitSet textNodeStack = new BitSet();
+ private int nodeDepth = 0;
+ private int completelyFiltered = 0;
+ private Stack<String> headingStack = new Stack<String>();
+ private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
private Map<String, ListStyle> listStyleMap = new HashMap<String,
ListStyle>();
private TextStyle textStyle;
private TextStyle lastTextStyle;
private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
private ListStyle listStyle;
- private OpenDocumentElementMappingContentHandler(ContentHandler
handler,
- Map<QName, TargetElement> mappings) {
- super(handler, mappings);
- this.handler = handler;
- }
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- // only forward content of tags from text:-namespace
- if (completelyFiltered == 0 && nodeDepth > 0
- && textNodeStack.get(nodeDepth - 1)) {
- lazyEndSpan();
- super.characters(ch,start,length);
- }
- }
-
- // helper for checking tags which need complete filtering
- // (with sub-tags)
- private boolean needsCompleteFiltering(
- String namespaceURI, String localName) {
- if (TEXT_NS.equals(namespaceURI)) {
- return localName.endsWith("-template")
- || localName.endsWith("-style");
- }
+ private OpenDocumentElementMappingContentHandler(ContentHandler handler,
+ Map<QName, TargetElement> mappings) {
+ super(handler, mappings);
+ this.handler = handler;
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ // only forward content of tags from text:-namespace
+ if (completelyFiltered == 0 && nodeDepth > 0
+ && textNodeStack.get(nodeDepth - 1)) {
+ lazyEndSpan();
+ super.characters(ch, start, length);
+ }
+ }
+
+ // helper for checking tags which need complete filtering
+ // (with sub-tags)
+ private boolean needsCompleteFiltering(
+ String namespaceURI, String localName) {
+ if (TEXT_NS.equals(namespaceURI)) {
+ return localName.endsWith("-template")
+ || localName.endsWith("-style");
+ }
return TABLE_NS.equals(namespaceURI) &&
"covered-table-cell".equals(localName);
- }
+ }
+
+ // map the heading level to <hX> HTML tags
+ private String getXHTMLHeaderTagName(Attributes atts) {
+ String depthStr = atts.getValue(TEXT_NS, "outline-level");
+ if (depthStr == null) {
+ return "h1";
+ }
+
+ int depth = Integer.parseInt(depthStr);
+ if (depth >= 6) {
+ return "h6";
+ } else if (depth <= 1) {
+ return "h1";
+ } else {
+ return "h" + depth;
+ }
+ }
- // map the heading level to <hX> HTML tags
- private String getXHTMLHeaderTagName(Attributes atts) {
- String depthStr = atts.getValue(TEXT_NS, "outline-level");
- if (depthStr == null) {
- return "h1";
- }
-
- int depth = Integer.parseInt(depthStr);
- if (depth >= 6) {
- return "h6";
- } else if (depth <= 1) {
- return "h1";
- } else {
- return "h" + depth;
- }
- }
-
- /**
- * Check if a node is a text node
- */
- private boolean isTextNode(String namespaceURI, String
localName) {
- if (TEXT_NS.equals(namespaceURI) &&
!localName.equals("page-number") && !localName.equals("page-count")) {
- return true;
- }
- if (SVG_NS.equals(namespaceURI)) {
- return "title".equals(localName) ||
- "desc".equals(localName);
- }
- return false;
- }
-
- private void startList(String name) throws SAXException {
- String elementName = "ul";
- if (name != null) {
- ListStyle style = listStyleMap.get(name);
- elementName = style != null ? style.getTag() : "ul";
- listStyleStack.push(style);
- }
+ /**
+ * Check if a node is a text node
+ */
+ private boolean isTextNode(String namespaceURI, String localName) {
+ if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
+ return true;
+ }
+ if (SVG_NS.equals(namespaceURI)) {
+ return "title".equals(localName) ||
+ "desc".equals(localName);
+ }
+ return false;
+ }
+
+ private void startList(String name) throws SAXException {
+ String elementName = "ul";
+ if (name != null) {
+ ListStyle style = listStyleMap.get(name);
+ elementName = style != null ? style.getTag() : "ul";
+ listStyleStack.push(style);
+ }
handler.startElement(XHTML, elementName, elementName,
EMPTY_ATTRIBUTES);
- }
+ }
- private void endList() throws SAXException {
+ private void endList() throws SAXException {
String elementName = "ul";
if (!listStyleStack.isEmpty()) {
ListStyle style = listStyleStack.pop();
elementName = style != null ? style.getTag() : "ul";
}
handler.endElement(XHTML, elementName, elementName);
- }
+ }
- private void startSpan(String name) throws SAXException {
- if (name == null) {
- return;
- }
+ private void startSpan(String name) throws SAXException {
+ if (name == null) {
+ return;
+ }
TextStyle style = textStyleMap.get(name);
if (style == null) {
- return;
+ return;
}
// End tags that refer to no longer valid styles
@@ -197,17 +197,17 @@ public class OpenDocumentContentParser e
textStyle = style;
lastTextStyle = null;
- }
+ }
- private void endSpan() throws SAXException {
- lastTextStyle = textStyle;
- textStyle = null;
- }
-
- private void lazyEndSpan() throws SAXException {
- if (lastTextStyle == null) {
- return;
- }
+ private void endSpan() throws SAXException {
+ lastTextStyle = textStyle;
+ textStyle = null;
+ }
+
+ private void lazyEndSpan() throws SAXException {
+ if (lastTextStyle == null) {
+ return;
+ }
if (lastTextStyle.underlined) {
handler.endElement(XHTML, "u", "u");
@@ -220,175 +220,175 @@ public class OpenDocumentContentParser e
}
lastTextStyle = null;
- }
+ }
+
+ @Override
+ public void startElement(
+ String namespaceURI, String localName, String qName,
+ Attributes attrs) throws SAXException {
+ // keep track of current node type. If it is a text node,
+ // a bit at the current depth its set in textNodeStack.
+ // characters() checks the top bit to determine, if the
+ // actual node is a text node to print out nodeDepth contains
+ // the depth of the current node and also marks top of stack.
+ assert nodeDepth >= 0;
- @Override
- public void startElement(
- String namespaceURI, String localName, String qName,
- Attributes attrs) throws SAXException {
- // keep track of current node type. If it is a text node,
- // a bit at the current depth its set in textNodeStack.
- // characters() checks the top bit to determine, if the
- // actual node is a text node to print out nodeDepth
contains
- // the depth of the current node and also marks top of
stack.
- assert nodeDepth >= 0;
-
- // Set styles
- if (STYLE_NS.equals(namespaceURI) &&
"style".equals(localName)) {
- String family = attrs.getValue(STYLE_NS, "family");
- if ("text".equals(family)) {
- textStyle = new TextStyle();
- String name = attrs.getValue(STYLE_NS, "name");
- textStyleMap.put(name, textStyle);
- }
- } else if (TEXT_NS.equals(namespaceURI) &&
"list-style".equals(localName)) {
- listStyle = new ListStyle();
+ // Set styles
+ if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+ String family = attrs.getValue(STYLE_NS, "family");
+ if ("text".equals(family)) {
+ textStyle = new TextStyle();
+ String name = attrs.getValue(STYLE_NS, "name");
+ textStyleMap.put(name, textStyle);
+ }
+ } else if (TEXT_NS.equals(namespaceURI) &&
"list-style".equals(localName)) {
+ listStyle = new ListStyle();
String name = attrs.getValue(STYLE_NS, "name");
listStyleMap.put(name, listStyle);
- } else if (textStyle != null &&
STYLE_NS.equals(namespaceURI)
- && "text-properties".equals(localName)) {
- String fontStyle =
attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
- if ("italic".equals(fontStyle) ||
"oblique".equals(fontStyle)) {
- textStyle.italic = true;
- }
- String fontWeight =
attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
- if ("bold".equals(fontWeight) ||
"bolder".equals(fontWeight)
- || (fontWeight!=null &&
Character.isDigit(fontWeight.charAt(0))
- && Integer.valueOf(fontWeight) > 500)) {
- textStyle.bold = true;
- }
- String underlineStyle = attrs.getValue(STYLE_NS,
"text-underline-style");
- if (underlineStyle != null) {
- textStyle.underlined = true;
- }
- } else if (listStyle != null &&
TEXT_NS.equals(namespaceURI)) {
- if ("list-level-style-bullet".equals(localName)) {
- listStyle.ordered = false;
- } else if ("list-level-style-number".equals(localName))
{
- listStyle.ordered = true;
- }
- }
-
- textNodeStack.set(nodeDepth++,
- isTextNode(namespaceURI, localName));
- // filter *all* content of some tags
- assert completelyFiltered >= 0;
-
- if (needsCompleteFiltering(namespaceURI, localName)) {
- completelyFiltered++;
- }
- // call next handler if no filtering
- if (completelyFiltered == 0) {
- // special handling of text:h, that are directly passed
- // to incoming handler
- if (TEXT_NS.equals(namespaceURI) &&
"h".equals(localName)) {
- final String el =
headingStack.push(getXHTMLHeaderTagName(attrs));
- handler.startElement(XHTMLContentHandler.XHTML, el,
el, EMPTY_ATTRIBUTES);
- } else if (TEXT_NS.equals(namespaceURI) &&
"list".equals(localName)) {
- startList(attrs.getValue(TEXT_NS, "style-name"));
+ } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
+ && "text-properties".equals(localName)) {
+ String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS,
"font-style");
+ if ("italic".equals(fontStyle) || "oblique".equals(fontStyle))
{
+ textStyle.italic = true;
+ }
+ String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS,
"font-weight");
+ if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
+ || (fontWeight != null &&
Character.isDigit(fontWeight.charAt(0))
+ && Integer.valueOf(fontWeight) > 500)) {
+ textStyle.bold = true;
+ }
+ String underlineStyle = attrs.getValue(STYLE_NS,
"text-underline-style");
+ if (underlineStyle != null) {
+ textStyle.underlined = true;
+ }
+ } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+ if ("list-level-style-bullet".equals(localName)) {
+ listStyle.ordered = false;
+ } else if ("list-level-style-number".equals(localName)) {
+ listStyle.ordered = true;
+ }
+ }
+
+ textNodeStack.set(nodeDepth++,
+ isTextNode(namespaceURI, localName));
+ // filter *all* content of some tags
+ assert completelyFiltered >= 0;
+
+ if (needsCompleteFiltering(namespaceURI, localName)) {
+ completelyFiltered++;
+ }
+ // call next handler if no filtering
+ if (completelyFiltered == 0) {
+ // special handling of text:h, that are directly passed
+ // to incoming handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ final String el =
headingStack.push(getXHTMLHeaderTagName(attrs));
+ handler.startElement(XHTMLContentHandler.XHTML, el, el,
EMPTY_ATTRIBUTES);
+ } else if (TEXT_NS.equals(namespaceURI) &&
"list".equals(localName)) {
+ startList(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) &&
"span".equals(localName)) {
startSpan(attrs.getValue(TEXT_NS, "style-name"));
- } else {
- super.startElement(namespaceURI, localName, qName,
attrs);
- }
- }
- }
-
- @Override
- public void endElement(
- String namespaceURI, String localName, String qName)
- throws SAXException {
+ } else {
+ super.startElement(namespaceURI, localName, qName, attrs);
+ }
+ }
+ }
+
+ @Override
+ public void endElement(
+ String namespaceURI, String localName, String qName)
+ throws SAXException {
if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
textStyle = null;
} else if (TEXT_NS.equals(namespaceURI) &&
"list-style".equals(localName)) {
listStyle = null;
}
- // call next handler if no filtering
- if (completelyFiltered == 0) {
- // special handling of text:h, that are directly passed
- // to incoming handler
- if (TEXT_NS.equals(namespaceURI) &&
"h".equals(localName)) {
- final String el = headingStack.pop();
- handler.endElement(XHTMLContentHandler.XHTML, el,
el);
- } else if (TEXT_NS.equals(namespaceURI) &&
"list".equals(localName)) {
- endList();
+ // call next handler if no filtering
+ if (completelyFiltered == 0) {
+ // special handling of text:h, that are directly passed
+ // to incoming handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ final String el = headingStack.pop();
+ handler.endElement(XHTMLContentHandler.XHTML, el, el);
+ } else if (TEXT_NS.equals(namespaceURI) &&
"list".equals(localName)) {
+ endList();
} else if (TEXT_NS.equals(namespaceURI) &&
"span".equals(localName)) {
endSpan();
- } else {
- if (TEXT_NS.equals(namespaceURI) &&
"p".equals(localName)) {
- lazyEndSpan();
- }
- super.endElement(namespaceURI,localName,qName);
- }
-
- // special handling of tabulators
- if (TEXT_NS.equals(namespaceURI)
- && ("tab-stop".equals(localName)
- || "tab".equals(localName))) {
- this.characters(TAB, 0, TAB.length);
- }
- }
-
- // revert filter for *all* content of some tags
- if (needsCompleteFiltering(namespaceURI,localName)) {
- completelyFiltered--;
- }
- assert completelyFiltered >= 0;
-
- // reduce current node depth
- nodeDepth--;
- assert nodeDepth >= 0;
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) {
- // remove prefix mappings as they should not occur in XHTML
- }
-
- @Override
- public void endPrefixMapping(String prefix) {
- // remove prefix mappings as they should not occur in XHTML
- }
- }
+ } else {
+ if (TEXT_NS.equals(namespaceURI) && "p".equals(localName))
{
+ lazyEndSpan();
+ }
+ super.endElement(namespaceURI, localName, qName);
+ }
+
+ // special handling of tabulators
+ if (TEXT_NS.equals(namespaceURI)
+ && ("tab-stop".equals(localName)
+ || "tab".equals(localName))) {
+ this.characters(TAB, 0, TAB.length);
+ }
+ }
+
+ // revert filter for *all* content of some tags
+ if (needsCompleteFiltering(namespaceURI, localName)) {
+ completelyFiltered--;
+ }
+ assert completelyFiltered >= 0;
+
+ // reduce current node depth
+ nodeDepth--;
+ assert nodeDepth >= 0;
+ }
- public static final String TEXT_NS =
- "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) {
+ // remove prefix mappings as they should not occur in XHTML
+ }
+ }
+
+ public static final String TEXT_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
public static final String TABLE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+ "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
public static final String STYLE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+ "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
public static final String FORMATTING_OBJECTS_NS =
- "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+ "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
public static final String OFFICE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
+ "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
public static final String SVG_NS =
- "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
+ "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
public static final String PRESENTATION_NS =
- "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
+ "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
public static final String DRAW_NS =
- "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+ "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
- protected static final char[] TAB = new char[] { '\t' };
+ protected static final char[] TAB = new char[]{'\t'};
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
/**
* Mappings between ODF tag names and XHTML tag names
* (including attributes). All other tag names/attributes are ignored
- * and left out from event stream.
+ * and left out from event stream.
*/
private static final HashMap<QName, TargetElement> MAPPINGS =
- new HashMap<QName, TargetElement>();
+ new HashMap<QName, TargetElement>();
static {
// general mappings of text:-tags
@@ -426,9 +426,9 @@ public class OpenDocumentContentParser e
MAPPINGS.put(
new QName(TEXT_NS, "span"),
new TargetElement(XHTML, "span"));
-
- final HashMap<QName,QName> aAttsMapping =
- new HashMap<QName,QName>();
+
+ final HashMap<QName, QName> aAttsMapping =
+ new HashMap<QName, QName>();
aAttsMapping.put(
new QName(XLINK_NS, "href"),
new QName("href"));
@@ -448,8 +448,8 @@ public class OpenDocumentContentParser e
new QName(TABLE_NS, "table-row"),
new TargetElement(XHTML, "tr"));
// special mapping for rowspan/colspan attributes
- final HashMap<QName,QName> tableCellAttsMapping =
- new HashMap<QName,QName>();
+ final HashMap<QName, QName> tableCellAttsMapping =
+ new HashMap<QName, QName>();
tableCellAttsMapping.put(
new QName(TABLE_NS, "number-columns-spanned"),
new QName("colspan"));
@@ -479,8 +479,8 @@ public class OpenDocumentContentParser e
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
parseInternal(stream,
- new XHTMLContentHandler(handler,metadata),
- metadata, context);
+ new XHTMLContentHandler(handler, metadata),
+ metadata, context);
}
void parseInternal(
@@ -496,7 +496,7 @@ public class OpenDocumentContentParser e
factory.setNamespaceAware(true);
try {
factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING,
true);
- } catch (SAXNotRecognizedException e){
+ } catch (SAXNotRecognizedException e) {
// TIKA-329: Some XML parsers do not support the secure-processing
// feature, even though it's required by JAXP in Java 5. Ignoring
// the exception is fine here, deployments without this feature
@@ -513,4 +513,3 @@ public class OpenDocumentContentParser e
}
}
-
\ No newline at end of file
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java?rev=1673236&r1=1673235&r2=1673236&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java Mon Apr 13 16:23:49 2015
@@ -50,33 +50,33 @@ public class OpenDocumentMetaParser exte
* Serial version UID
*/
private static final long serialVersionUID = -8739250869531737584L;
-
- private static final String META_NS =
"urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+
+ private static final String META_NS =
"urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
private static final XPathParser META_XPATH = new XPathParser("meta",
META_NS);
-
- /**
- * @see OfficeOpenXMLCore#SUBJECT
+
+ /**
+ * @see OfficeOpenXMLCore#SUBJECT
* @deprecated use OfficeOpenXMLCore#SUBJECT
*/
@Deprecated
- private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR
=
- Property.composite(Office.INITIAL_AUTHOR,
- new Property[] { Property.externalText("initial-creator") });
-
+ private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR
=
+ Property.composite(Office.INITIAL_AUTHOR,
+ new Property[]{Property.externalText("initial-creator")});
+
private static ContentHandler getDublinCoreHandler(
Metadata metadata, Property property, String element) {
return new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, element,
metadata, property);
}
-
+
private static ContentHandler getMeta(
ContentHandler ch, Metadata md, Property property, String element)
{
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:" + element),
META_XPATH.parse("//meta:" + element + "//text()"));
ContentHandler branch =
- new MatchingContentHandler(new MetadataHandler(md, property),
matcher);
+ new MatchingContentHandler(new MetadataHandler(md, property),
matcher);
return new TeeContentHandler(ch, branch);
}
@@ -87,27 +87,29 @@ public class OpenDocumentMetaParser exte
META_XPATH.parse("//meta:user-defined//text()"));
// eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
ContentHandler branch = new MatchingContentHandler(
- new AttributeDependantMetadataHandler(md, "meta:name",
Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
- matcher);
+ new AttributeDependantMetadataHandler(md, "meta:name",
Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
+ matcher);
return new TeeContentHandler(ch, branch);
}
- @Deprecated private static ContentHandler getStatistic(
+ @Deprecated
+ private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
- META_XPATH.parse("//meta:document-statistic/@meta:"+attribute);
+ META_XPATH.parse("//meta:document-statistic/@meta:" +
attribute);
ContentHandler branch = new MatchingContentHandler(
- new AttributeMetadataHandler(META_NS, attribute, md, name),
matcher);
+ new AttributeMetadataHandler(META_NS, attribute, md, name),
matcher);
return new TeeContentHandler(ch, branch);
}
+
private static ContentHandler getStatistic(
- ContentHandler ch, Metadata md, Property property, String attribute)
{
- Matcher matcher =
- META_XPATH.parse("//meta:document-statistic/@meta:"+attribute);
- ContentHandler branch = new MatchingContentHandler(
- new AttributeMetadataHandler(META_NS, attribute, md, property),
matcher);
- return new TeeContentHandler(ch, branch);
- }
+ ContentHandler ch, Metadata md, Property property, String
attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" +
attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md,
property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
protected ContentHandler getContentHandler(ContentHandler ch, Metadata md,
ParseContext context) {
// We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
@@ -123,48 +125,48 @@ public class OpenDocumentMetaParser exte
getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER,
"identifier"),
getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE,
"language"),
getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
-
+
// Process the OO Meta Attributes
ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
// ODF uses dc:date for modified
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "date",
md, TikaCoreProperties.MODIFIED));
-
+
// ODF uses dc:subject for description
ch = new TeeContentHandler(ch, new ElementMetadataHandler(
DublinCore.NAMESPACE_URI_DC, "subject",
md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
ch = getMeta(ch, md,
TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
-
- ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME),
"editing-duration");
+
+ ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME),
"editing-duration");
ch = getMeta(ch, md, Property.externalText("editing-cycles"),
"editing-cycles");
ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR,
"initial-creator");
ch = getMeta(ch, md, Property.externalText("generator"), "generator");
-
+
// Process the user defined Meta Attributes
ch = getUserDefined(ch, md);
-
+
// Process the OO Statistics Attributes
- ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
- ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
- ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
- ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
- ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+ ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
- ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
-
+
// Legacy, Tika-1.0 style attributes
// TODO Remove these in Tika 2.0
- ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
- ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
- ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
- ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
- ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
-
+
// Legacy Statistics Attributes, replaced with real keys above
// TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
ch = getStatistic(ch, md, "nbPage", "page-count");
@@ -174,12 +176,12 @@ public class OpenDocumentMetaParser exte
ch = getStatistic(ch, md, "nbTab", "table-count");
ch = getStatistic(ch, md, "nbObject", "object-count");
ch = getStatistic(ch, md, "nbImg", "image-count");
-
+
// Normalise the rest
ch = new NSNormalizerContentHandler(ch);
return ch;
}
-
+
@Override
public void parse(
InputStream stream, ContentHandler handler,
@@ -188,10 +190,10 @@ public class OpenDocumentMetaParser exte
super.parse(stream, handler, metadata, context);
// Copy subject to description for OO2
String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
- if (odfSubject != null && !odfSubject.equals("") &&
+ if (odfSubject != null && !odfSubject.equals("") &&
(metadata.get(TikaCoreProperties.DESCRIPTION) == null ||
metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
}
}
-
+
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java?rev=1673236&r1=1673235&r2=1673236&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java Mon Apr 13 16:23:49 2015
@@ -46,47 +46,49 @@ import org.xml.sax.helpers.DefaultHandle
*/
public class OpenDocumentParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -6410276875438618287L;
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.sun.xml.writer"),
- MediaType.application("vnd.oasis.opendocument.text"),
- MediaType.application("vnd.oasis.opendocument.graphics"),
- MediaType.application("vnd.oasis.opendocument.presentation"),
- MediaType.application("vnd.oasis.opendocument.spreadsheet"),
- MediaType.application("vnd.oasis.opendocument.chart"),
- MediaType.application("vnd.oasis.opendocument.image"),
- MediaType.application("vnd.oasis.opendocument.formula"),
- MediaType.application("vnd.oasis.opendocument.text-master"),
- MediaType.application("vnd.oasis.opendocument.text-web"),
- MediaType.application("vnd.oasis.opendocument.text-template"),
-
MediaType.application("vnd.oasis.opendocument.graphics-template"),
-
MediaType.application("vnd.oasis.opendocument.presentation-template"),
-
MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
- MediaType.application("vnd.oasis.opendocument.chart-template"),
- MediaType.application("vnd.oasis.opendocument.image-template"),
-
MediaType.application("vnd.oasis.opendocument.formula-template"),
- MediaType.application("x-vnd.oasis.opendocument.text"),
- MediaType.application("x-vnd.oasis.opendocument.graphics"),
- MediaType.application("x-vnd.oasis.opendocument.presentation"),
- MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
- MediaType.application("x-vnd.oasis.opendocument.chart"),
- MediaType.application("x-vnd.oasis.opendocument.image"),
- MediaType.application("x-vnd.oasis.opendocument.formula"),
- MediaType.application("x-vnd.oasis.opendocument.text-master"),
- MediaType.application("x-vnd.oasis.opendocument.text-web"),
-
MediaType.application("x-vnd.oasis.opendocument.text-template"),
-
MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
-
MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
-
MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
-
MediaType.application("x-vnd.oasis.opendocument.chart-template"),
-
MediaType.application("x-vnd.oasis.opendocument.image-template"),
-
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.sun.xml.writer"),
+ MediaType.application("vnd.oasis.opendocument.text"),
+ MediaType.application("vnd.oasis.opendocument.graphics"),
+
MediaType.application("vnd.oasis.opendocument.presentation"),
+
MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("vnd.oasis.opendocument.chart"),
+ MediaType.application("vnd.oasis.opendocument.image"),
+ MediaType.application("vnd.oasis.opendocument.formula"),
+
MediaType.application("vnd.oasis.opendocument.text-master"),
+ MediaType.application("vnd.oasis.opendocument.text-web"),
+
MediaType.application("vnd.oasis.opendocument.text-template"),
+
MediaType.application("vnd.oasis.opendocument.graphics-template"),
+
MediaType.application("vnd.oasis.opendocument.presentation-template"),
+
MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+
MediaType.application("vnd.oasis.opendocument.chart-template"),
+
MediaType.application("vnd.oasis.opendocument.image-template"),
+
MediaType.application("vnd.oasis.opendocument.formula-template"),
+ MediaType.application("x-vnd.oasis.opendocument.text"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics"),
+
MediaType.application("x-vnd.oasis.opendocument.presentation"),
+
MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("x-vnd.oasis.opendocument.chart"),
+ MediaType.application("x-vnd.oasis.opendocument.image"),
+ MediaType.application("x-vnd.oasis.opendocument.formula"),
+
MediaType.application("x-vnd.oasis.opendocument.text-master"),
+ MediaType.application("x-vnd.oasis.opendocument.text-web"),
+
MediaType.application("x-vnd.oasis.opendocument.text-template"),
+
MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+
MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+
MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+
MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+
MediaType.application("x-vnd.oasis.opendocument.image-template"),
+
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
private static final String META_NAME = "meta.xml";
-
+
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
@@ -126,7 +128,7 @@ public class OpenDocumentParser extends
if (container instanceof ZipFile) {
zipFile = (ZipFile) container;
} else if (tis.hasFile()) {
- zipFile = new ZipFile(tis.getFile());
+ zipFile = new ZipFile(tis.getFile());
} else {
zipStream = new ZipInputStream(stream);
}
@@ -139,9 +141,9 @@ public class OpenDocumentParser extends
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
- EndDocumentShieldingContentHandler handler =
- new EndDocumentShieldingContentHandler(xhtml);
-
+ EndDocumentShieldingContentHandler handler =
+ new EndDocumentShieldingContentHandler(xhtml);
+
// If we can, process the metadata first, then the
// rest of the file afterwards
// Only possible to guarantee that when opened from a file not a stream
@@ -153,7 +155,7 @@ public class OpenDocumentParser extends
Enumeration<? extends ZipEntry> entries = zipFile.entries();
while (entries.hasMoreElements()) {
entry = entries.nextElement();
- if (! META_NAME.equals(entry.getName())) {
+ if (!META_NAME.equals(entry.getName())) {
handleZipEntry(entry, zipFile.getInputStream(entry),
metadata, context, handler);
}
}
@@ -165,18 +167,18 @@ public class OpenDocumentParser extends
} while (entry != null);
zipStream.close();
}
-
+
// Only now call the end document
- if(handler.getEndDocumentWasCalled()) {
- handler.reallyEndDocument();
+ if (handler.getEndDocumentWasCalled()) {
+ handler.reallyEndDocument();
}
}
-
- private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata
metadata,
- ParseContext context, EndDocumentShieldingContentHandler handler)
+
+ private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata
metadata,
+ ParseContext context,
EndDocumentShieldingContentHandler handler)
throws IOException, SAXException, TikaException {
if (entry == null) return;
-
+
if (entry.getName().equals("mimetype")) {
String type = IOUtils.toString(zip, IOUtils.UTF_8.name());
metadata.set(Metadata.CONTENT_TYPE, type);
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1673236&r1=1673235&r2=1673236&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
Mon Apr 13 16:23:49 2015
@@ -374,6 +374,7 @@ public class ODFParserTest extends TikaT
}
}
+ // TIKA-1063: Test basic style support.
@Test
public void testODTStyles() throws Exception {
String xml = getXML("testStyles.odt").xml;
@@ -384,4 +385,27 @@ public class ODFParserTest extends TikaT
assertContains("<ul>\t<li><p>First</p>", xml);
assertContains("</ul>", xml);
}
+
+ //TIKA-1600: Test that null pointer doesn't break parsing.
+ @Test
+ public void testNullStylesInODTFooter() throws Exception {
+ Parser parser = new OpenDocumentParser();
+ InputStream input =
ODFParserTest.class.getResourceAsStream("/test-documents/testODT-TIKA-6000.odt");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(input, handler, metadata, new ParseContext());
+
+ assertEquals("application/vnd.oasis.opendocument.text",
metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+
+ assertContains("Utilisation de ce document", content);
+ assertContains("Copyright and License", content);
+ assertContains("Changer la langue", content);
+ assertContains("La page d’accueil permet de faire une recherche
simple", content);
+ } finally {
+ input.close();
+ }
+ }
}