Repository: tika
Updated Branches:
  refs/heads/master 81279a1e0 -> 2031de70c
TIKA-2019 -- clean up -- move state variables to inner classes, convert protected to package private, add @Override on parse

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2031de70
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2031de70
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2031de70

Branch: refs/heads/master
Commit: 2031de70c117fdabf793008fe22dd9c97c82d2c9
Parents: 81279a1
Author: tballison <[email protected]>
Authored: Fri Jun 24 10:19:59 2016 -0400
Committer: tballison <[email protected]>
Committed: Fri Jun 24 10:19:59 2016 -0400

----------------------------------------------------------------------
 .../microsoft/xml/AbstractXML2003Parser.java | 45 ++++++++++----------
 .../microsoft/xml/SpreadsheetMLParser.java | 3 +-
 .../tika/parser/microsoft/xml/WordMLParser.java | 8 ++--
 3 files changed, 28 insertions(+), 28 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/tika/blob/2031de70/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java index 637b4d6..4e05d0e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java @@ -43,31 +43,31 @@ import java.io.InputStream; public abstract class AbstractXML2003Parser extends AbstractParser { - protected final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office"; - protected final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office"; - protected final static String 
MS_SPREADSHEET_URN = "urn:schemas-microsoft-com:office:spreadsheet"; - protected final static String WORD_ML_URL = "http://schemas.microsoft.com/office/word/2003/wordml"; - protected final static Attributes EMPTY_ATTRS = new AttributesImpl(); + final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office"; + final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office"; + final static String MS_SPREADSHEET_URN = "urn:schemas-microsoft-com:office:spreadsheet"; + final static String WORD_ML_URL = "http://schemas.microsoft.com/office/word/2003/wordml"; + final static Attributes EMPTY_ATTRS = new AttributesImpl(); - protected final static String DOCUMENT_PROPERTIES = "DocumentProperties"; - protected final static String PICT = "pict"; - protected final static String BIN_DATA = "binData"; + final static String DOCUMENT_PROPERTIES = "DocumentProperties"; + final static String PICT = "pict"; + final static String BIN_DATA = "binData"; - protected final static String A = "a"; - protected final static String BODY = "body"; - protected final static String CDATA = "cdata"; - protected final static String DIV = "div"; - protected final static String HREF = "href"; - protected final static String IMG = "img"; - protected final static String P = "p"; - protected final static String TD = "td"; - protected final static String TR = "tr"; - protected final static String TABLE = "table"; - protected final static String TBODY = "tbody"; + final static String A = "a"; + final static String BODY = "body"; + final static String CDATA = "cdata"; + final static String DIV = "div"; + final static String HREF = "href"; + final static String IMG = "img"; + final static String P = "p"; + final static String TD = "td"; + final static String TR = "tr"; + final static String TABLE = "table"; + final static String TBODY = "tbody"; - protected final static String HLINK = "hlink"; - protected final static String HLINK_DEST = "dest"; - protected 
final static String NAME_ATTR = "name"; + final static String HLINK = "hlink"; + final static String HLINK_DEST = "dest"; + final static String NAME_ATTR = "name"; private static ContentHandler getMSPropertiesHandler( @@ -77,6 +77,7 @@ public abstract class AbstractXML2003Parser extends AbstractParser { metadata, property); } + @Override public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) http://git-wip-us.apache.org/repos/asf/tika/blob/2031de70/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java index 0cf7520..c442453 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/SpreadsheetMLParser.java @@ -52,8 +52,6 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser { Collections.unmodifiableSet(new HashSet<>(Arrays.asList( MEDIA_TYPE))); - private boolean inBody = false; - @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; @@ -78,6 +76,7 @@ public class SpreadsheetMLParser extends AbstractXML2003Parser { StringBuilder buffer = new StringBuilder(); String href = null; boolean inData = false; + private boolean inBody = false; public ExcelMLHandler(ContentHandler handler) { this.handler = handler; http://git-wip-us.apache.org/repos/asf/tika/blob/2031de70/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java index 6bd51da..16b8c46 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java @@ -62,7 +62,6 @@ public class WordMLParser extends AbstractXML2003Parser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( MEDIA_TYPE))); - private boolean inBody = false; static { WORDML_TO_XHTML.put(P, P); @@ -107,6 +106,7 @@ public class WordMLParser extends AbstractXML2003Parser { private class WordMLHandler extends DefaultHandler { private final ContentHandler handler; private boolean ignoreCharacters; + private boolean inBody = false; //use inP to keep track of whether the handler is //in a paragraph or not. <p><p></p></p> was allowed @@ -128,7 +128,7 @@ public class WordMLParser extends AbstractXML2003Parser { } String html = WORDML_TO_XHTML.get(localName); if (html != null) { - if ("p".equals(localName)) { + if (P.equals(localName)) { //close p if already in a p to prevent nested <p> if (inP) { handler.endElement(XHTMLContentHandler.XHTML, P, P); @@ -165,13 +165,13 @@ public class WordMLParser extends AbstractXML2003Parser { if (html.equals(TABLE)) { handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY); } - if ("p".equals(html) && !inP) { + if (P.equals(html) && !inP) { //start p if not already in one to prevent non-matching <p> handler.startElement(XHTMLContentHandler.XHTML, P, P, EMPTY_ATTRS); } handler.endElement(XHTMLContentHandler.XHTML, html, html); - if ("p".equals(html)) { + if (P.equals(html)) { inP = false; } }
