Author: tallison
Date: Fri May 29 14:36:21 2015
New Revision: 1682489
URL: http://svn.apache.org/r1682489
Log:
TIKA-1643: clean up code in tika-parsers -- changed all newlines to LF and
auto-formatted the code of most parsers that I had mis-styled.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/WebPParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/ListDescriptor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/Icu4jEncodingDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/BPGParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/WebPParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OldExcelParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ProjectParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
tika/trunk/tika-server/pom.xml
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
Fri May 29 14:36:21 2015
@@ -22,14 +22,6 @@ import java.util.BitSet;
import java.util.List;
import java.util.Locale;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
@@ -37,102 +29,40 @@ import de.l3s.boilerpipe.document.TextDo
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.DefaultExtractor;
import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
/**
* Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
* library to automatically extract the main content from a web page.
- *
+ * <p/>
* Use this as a {@link ContentHandler} object passed to
* {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata,
org.apache.tika.parser.ParseContext)}
*/
public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
- private static class RecordedElement {
- public enum ElementType {
- START,
- END,
- CONTINUE
- }
-
- private String uri;
- private String localName;
- private String qName;
- private Attributes attrs;
- private List<char[]> characters;
- private ElementType elementType;
-
- public RecordedElement(String uri, String localName, String qName,
Attributes attrs) {
- this(uri, localName, qName, attrs, ElementType.START);
- }
-
- public RecordedElement(String uri, String localName, String qName) {
- this(uri, localName, qName, null, ElementType.END);
- }
-
- public RecordedElement() {
- this(null, null, null, null, ElementType.CONTINUE);
- }
-
- protected RecordedElement(String uri, String localName, String qName,
Attributes attrs, RecordedElement.ElementType elementType) {
- this.uri = uri;
- this.localName = localName;
- this.qName = qName;
- this.attrs = attrs;
- this.elementType = elementType;
- this.characters = new ArrayList<char[]>();
- }
-
- @Override
- public String toString() {
- return String.format(Locale.ROOT, "<%s> of type %s", localName,
elementType);
- }
-
- public String getUri() {
- return uri;
- }
-
- public String getLocalName() {
- return localName;
- }
-
- public String getQName() {
- return qName;
- }
-
- public Attributes getAttrs() {
- return attrs;
- }
-
- public List<char[]> getCharacters() {
- return characters;
- }
-
- public RecordedElement.ElementType getElementType() {
- return elementType;
- }
- }
-
/**
* The newline character that gets inserted after block elements.
*/
- private static final char[] NL = new char[] { '\n' };
-
+ private static final char[] NL = new char[]{'\n'};
private ContentHandler delegate;
private BoilerpipeExtractor extractor;
-
private boolean includeMarkup;
private boolean inHeader;
private boolean inFooter;
private int headerCharOffset;
private List<RecordedElement> elements;
private TextDocument td;
-
/**
* Creates a new boilerpipe-based content extractor, using the
* {@link DefaultExtractor} extraction rules and "delegate" as the content
handler.
*
- * @param delegate
- * The {@link ContentHandler} object
+ * @param delegate The {@link ContentHandler} object
*/
public BoilerpipeContentHandler(ContentHandler delegate) {
this(delegate, DefaultExtractor.INSTANCE);
@@ -153,10 +83,8 @@ public class BoilerpipeContentHandler ex
* extraction rules. The extracted main content will be passed to the
* <delegate> content handler.
*
- * @param delegate
- * The {@link ContentHandler} object
- * @param extractor
- * Extraction rules to use, e.g. {@link ArticleExtractor}
+ * @param delegate The {@link ContentHandler} object
+ * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
*/
public BoilerpipeContentHandler(ContentHandler delegate,
BoilerpipeExtractor extractor) {
this.td = null;
@@ -164,14 +92,14 @@ public class BoilerpipeContentHandler ex
this.extractor = extractor;
}
- public void setIncludeMarkup(boolean includeMarkup) {
- this.includeMarkup = includeMarkup;
- }
-
public boolean isIncludeMarkup() {
return includeMarkup;
}
+ public void setIncludeMarkup(boolean includeMarkup) {
+ this.includeMarkup = includeMarkup;
+ }
+
/**
* Retrieves the built TextDocument
*
@@ -194,13 +122,15 @@ public class BoilerpipeContentHandler ex
if (includeMarkup) {
elements = new ArrayList<RecordedElement>();
}
- };
+ }
@Override
public void startPrefixMapping(String prefix, String uri) throws
SAXException {
super.startPrefixMapping(prefix, uri);
delegate.startPrefixMapping(prefix, uri);
- };
+ }
+
+ ;
@Override
public void startElement(String uri, String localName, String qName,
Attributes atts) throws SAXException {
@@ -216,7 +146,9 @@ public class BoilerpipeContentHandler ex
// This happens for the <body> element, if we're not doing markup.
delegate.startElement(uri, localName, qName, atts);
}
- };
+ }
+
+ ;
@Override
public void characters(char[] chars, int offset, int length) throws
SAXException {
@@ -234,7 +166,9 @@ public class BoilerpipeContentHandler ex
System.arraycopy(chars, offset, characters, 0, length);
element.getCharacters().add(characters);
}
- };
+ }
+
+ ;
@Override
public void endElement(String uri, String localName, String qName) throws
SAXException {
@@ -252,7 +186,9 @@ public class BoilerpipeContentHandler ex
elements.add(new RecordedElement(uri, localName, qName));
elements.add(new RecordedElement());
}
- };
+ }
+
+ ;
@Override
public void endDocument() throws SAXException {
@@ -342,4 +278,70 @@ public class BoilerpipeContentHandler ex
delegate.endDocument();
}
+
+ ;
+
+ private static class RecordedElement {
+ private String uri;
+ private String localName;
+ private String qName;
+ private Attributes attrs;
+ private List<char[]> characters;
+ private ElementType elementType;
+ public RecordedElement(String uri, String localName, String qName,
Attributes attrs) {
+ this(uri, localName, qName, attrs, ElementType.START);
+ }
+
+ public RecordedElement(String uri, String localName, String qName) {
+ this(uri, localName, qName, null, ElementType.END);
+ }
+
+ public RecordedElement() {
+ this(null, null, null, null, ElementType.CONTINUE);
+ }
+
+ protected RecordedElement(String uri, String localName, String qName,
Attributes attrs, RecordedElement.ElementType elementType) {
+ this.uri = uri;
+ this.localName = localName;
+ this.qName = qName;
+ this.attrs = attrs;
+ this.elementType = elementType;
+ this.characters = new ArrayList<char[]>();
+ }
+
+ @Override
+ public String toString() {
+ return String.format(Locale.ROOT, "<%s> of type %s", localName,
elementType);
+ }
+
+ public String getUri() {
+ return uri;
+ }
+
+ public String getLocalName() {
+ return localName;
+ }
+
+ public String getQName() {
+ return qName;
+ }
+
+ public Attributes getAttrs() {
+ return attrs;
+ }
+
+ public List<char[]> getCharacters() {
+ return characters;
+ }
+
+ public RecordedElement.ElementType getElementType() {
+ return elementType;
+ }
+
+ public enum ElementType {
+ START,
+ END,
+ CONTINUE
+ }
+ }
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
Fri May 29 14:36:21 2015
@@ -29,6 +29,10 @@ import java.util.Set;
@SuppressWarnings("serial")
public class DefaultHtmlMapper implements HtmlMapper {
+ /**
+ * @since Apache Tika 0.8
+ */
+ public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
// Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
private static final Map<String, String> SAFE_ELEMENTS = new
HashMap<String, String>() {{
put("H1", "h1");
@@ -42,7 +46,7 @@ public class DefaultHtmlMapper implement
put("PRE", "pre");
put("BLOCKQUOTE", "blockquote");
put("Q", "q");
-
+
put("UL", "ul");
put("OL", "ol");
put("MENU", "ul");
@@ -59,10 +63,10 @@ public class DefaultHtmlMapper implement
put("TD", "td");
put("ADDRESS", "address");
-
+
// TIKA-460 - add anchors
put("A", "a");
-
+
// TIKA-463 - add additional elements that contain URLs (and their
sub-elements)
put("MAP", "map");
put("AREA", "area");
@@ -75,12 +79,10 @@ public class DefaultHtmlMapper implement
put("INS", "ins");
put("DEL", "del");
}};
-
private static final Set<String> DISCARDABLE_ELEMENTS = new
HashSet<String>() {{
add("STYLE");
add("SCRIPT");
}};
-
// For information on tags & attributes, see:
//
http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
// http://www.w3schools.com/TAGS/
@@ -92,17 +94,17 @@ public class DefaultHtmlMapper implement
put("link", attrSet("charset", "href", "hreflang", "type", "rel",
"rev", "media"));
put("map", attrSet("id", "class", "style", "title", "name"));
put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
- put("object", attrSet("declare", "classid", "codebase", "data",
"type", "codetype", "archive", "standby", "height",
+ put("object", attrSet("declare", "classid", "codebase", "data",
"type", "codetype", "archive", "standby", "height",
"width", "usemap", "name", "tabindex", "align", "border",
"hspace", "vspace"));
put("param", attrSet("id", "name", "value", "valuetype", "type"));
put("blockquote", attrSet("cite"));
put("ins", attrSet("cite", "datetime"));
put("del", attrSet("cite", "datetime"));
put("q", attrSet("cite"));
-
+
// TODO - fill out this set. Include core, i18n, etc sets where
appropriate.
}};
-
+
private static Set<String> attrSet(String... attrs) {
Set<String> result = new HashSet<String>();
for (String attr : attrs) {
@@ -110,18 +112,14 @@ public class DefaultHtmlMapper implement
}
return result;
}
-
- /**
- * @since Apache Tika 0.8
- */
- public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
public String mapSafeElement(String name) {
return SAFE_ELEMENTS.get(name);
}
- /** Normalizes an attribute name. Assumes that the element name
- * is valid and normalized
+ /**
+ * Normalizes an attribute name. Assumes that the element name
+ * is valid and normalized
*/
public String mapSafeAttribute(String elementName, String attributeName) {
Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
@@ -131,7 +129,7 @@ public class DefaultHtmlMapper implement
return null;
}
}
-
+
public boolean isDiscardElement(String name) {
return DISCARDABLE_ELEMENTS.contains(name);
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
Fri May 29 14:36:21 2015
@@ -41,11 +41,11 @@ public class HtmlEncodingDetector implem
// TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
private static final int META_TAG_BUFFER_SIZE = 8192;
-
+
private static final Pattern HTTP_META_PATTERN = Pattern.compile(
- "(?is)<\\s*meta\\s+([^<>]+)"
- );
-
+ "(?is)<\\s*meta\\s+([^<>]+)"
+ );
+
//this should match both the older:
//<meta http-equiv="content-type" content="text/html; charset=xyz"/>
//and
@@ -57,9 +57,9 @@ public class HtmlEncodingDetector implem
//For a more general "not" matcher, try:
//("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN =
Pattern.compile(
- ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
- );
-
+ ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
+ );
+
private static final Charset ASCII = Charset.forName("US-ASCII");
public Charset detect(InputStream input, Metadata metadata)
@@ -81,27 +81,27 @@ public class HtmlEncodingDetector implem
// Interpret the head as ASCII and try to spot a meta tag with
// a possible character encoding hint
-
+
String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
Matcher equiv = HTTP_META_PATTERN.matcher(head);
Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher("");
//iterate through meta tags
while (equiv.find()) {
- String attrs = equiv.group(1);
- charsetMatcher.reset(attrs);
- //iterate through charset= and return the first match
- //that is valid
- while (charsetMatcher.find()){
- String candCharset = charsetMatcher.group(1);
- if (CharsetUtils.isSupported(candCharset)){
- try{
- return CharsetUtils.forName(candCharset);
- } catch (Exception e){
- //ignore
- }
- }
- }
+ String attrs = equiv.group(1);
+ charsetMatcher.reset(attrs);
+ //iterate through charset= and return the first match
+ //that is valid
+ while (charsetMatcher.find()) {
+ String candCharset = charsetMatcher.group(1);
+ if (CharsetUtils.isSupported(candCharset)) {
+ try {
+ return CharsetUtils.forName(candCharset);
+ } catch (Exception e) {
+ //ignore
+ }
+ }
+ }
}
return null;
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
Fri May 29 14:36:21 2015
@@ -39,23 +39,17 @@ class HtmlHandler extends TextContentHan
// List of attributes that need to be resolved.
private static final Set<String> URI_ATTRIBUTES =
- new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
-
+ new HashSet<String>(Arrays.asList("src", "href", "longdesc",
"cite"));
+ private static final Pattern ICBM =
+ Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
private final HtmlMapper mapper;
-
private final XHTMLContentHandler xhtml;
-
private final Metadata metadata;
-
+ private final StringBuilder title = new StringBuilder();
private int bodyLevel = 0;
-
private int discardLevel = 0;
-
private int titleLevel = 0;
-
- private boolean isTitleSetToMetadata = false;
-
- private final StringBuilder title = new StringBuilder();
+ private boolean isTitleSetToMetadata = false;
private HtmlHandler(
HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
@@ -140,9 +134,6 @@ class HtmlHandler extends TextContentHan
title.setLength(0);
}
- private static final Pattern ICBM =
- Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
-
/**
* Adds a metadata setting from the HTML <head/> to the Tika metadata
* object. The name and value are normalized where possible.
@@ -159,7 +150,7 @@ class HtmlHandler extends TextContentHan
} else {
metadata.set("ICBM", value);
}
- } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)){
+ } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
//don't overwrite Metadata.CONTENT_TYPE!
MediaType type = MediaType.parse(value);
if (type != null) {
@@ -208,7 +199,7 @@ class HtmlHandler extends TextContentHan
newAttributes.setValue(att, codebase);
} else if (isObject
&& ("data".equals(normAttrName)
- || "classid".equals(normAttrName))) {
+ || "classid".equals(normAttrName))) {
newAttributes.setValue(
att,
resolve(codebase, newAttributes.getValue(att)));
@@ -241,7 +232,7 @@ class HtmlHandler extends TextContentHan
if (titleLevel > 0) {
titleLevel--;
- if (titleLevel == 0 && !isTitleSetToMetadata) {
+ if (titleLevel == 0 && !isTitleSetToMetadata) {
metadata.set(TikaCoreProperties.TITLE,
title.toString().trim());
isTitleSetToMetadata = true;
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
Fri May 29 14:36:21 2015
@@ -37,7 +37,7 @@ public interface HtmlMapper {
*
* @param name HTML element name (upper case)
* @return XHTML element name (lower case), or
- * <code>null</code> if the element is unsafe
+ * <code>null</code> if the element is unsafe
*/
String mapSafeElement(String name);
@@ -47,22 +47,22 @@ public interface HtmlMapper {
*
* @param name HTML element name (upper case)
* @return <code>true</code> if content inside the named element
- * should be ignored, <code>false</code> otherwise
+ * should be ignored, <code>false</code> otherwise
*/
boolean isDiscardElement(String name);
-
-
+
+
/**
* Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
* given attribute is unknown or deemed unsafe for inclusion in the parse
* output, then this method returns <code>null</code> and the attribute
- * will be ignored. This method assumes that the element name
+ * will be ignored. This method assumes that the element name
* is valid and normalised.
*
- * @param elementName HTML element name (lower case)
+ * @param elementName HTML element name (lower case)
* @param attributeName HTML attribute name (lower case)
* @return XHTML attribute name (lower case), or
- * <code>null</code> if the element is unsafe
+ * <code>null</code> if the element is unsafe
*/
String mapSafeAttribute(String elementName, String attributeName);
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Fri May 29 14:36:21 2015
@@ -44,7 +44,9 @@ import org.xml.sax.SAXException;
*/
public class HtmlParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 7895315240498733128L;
private static final MediaType XHTML = MediaType.application("xhtml+xml");
@@ -52,11 +54,11 @@ public class HtmlParser extends Abstract
private static final MediaType X_ASP = MediaType.application("x-asp");
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.text("html"),
- XHTML,
- WAP_XHTML,
- X_ASP)));
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.text("html"),
+ XHTML,
+ WAP_XHTML,
+ X_ASP)));
private static final ServiceLoader LOADER =
new ServiceLoader(HtmlParser.class.getClassLoader());
@@ -132,15 +134,15 @@ public class HtmlParser extends Abstract
* will be ignored but the content inside it is still processed. See
* the {@link #isDiscardElement(String)} method for a way to discard
* the entire contents of an element.
- * <p>
+ * <p/>
* Subclasses can override this method to customize the default mapping.
*
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- * @since Apache Tika 0.5
* @param name HTML element name (upper case)
* @return XHTML element name (lower case), or
- * <code>null</code> if the element is unsafe
+ * <code>null</code> if the element is unsafe
+ * @since Apache Tika 0.5
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
*/
protected String mapSafeElement(String name) {
return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
@@ -151,25 +153,25 @@ public class HtmlParser extends Abstract
* discarded instead of including it in the parse output. Subclasses
* can override this method to customize the set of discarded elements.
*
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- * @since Apache Tika 0.5
* @param name HTML element name (upper case)
* @return <code>true</code> if content inside the named element
- * should be ignored, <code>false</code> otherwise
+ * should be ignored, <code>false</code> otherwise
+ * @since Apache Tika 0.5
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
*/
protected boolean isDiscardElement(String name) {
return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
}
/**
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- **/
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
public String mapSafeAttribute(String elementName, String attributeName) {
- return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName,
attributeName) ;
- }
-
+ return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName,
attributeName);
+ }
+
/**
* Adapter class that maintains backwards compatibility with the
* protected HtmlParser methods. Making HtmlParser implement HtmlMapper
@@ -177,17 +179,19 @@ public class HtmlParser extends Abstract
* backwards compatibility with subclasses.
*
* @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This class will be removed in Tika 1.0.
+ * the HTML mapping. This class will be removed in Tika 1.0.
*/
private class HtmlParserMapper implements HtmlMapper {
public String mapSafeElement(String name) {
return HtmlParser.this.mapSafeElement(name);
}
+
public boolean isDiscardElement(String name) {
return HtmlParser.this.isDiscardElement(name);
}
- public String mapSafeAttribute(String elementName, String
attributeName){
- return HtmlParser.this.mapSafeAttribute(elementName,attributeName);
+
+ public String mapSafeAttribute(String elementName, String
attributeName) {
+ return HtmlParser.this.mapSafeAttribute(elementName,
attributeName);
}
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
Fri May 29 14:36:21 2015
@@ -21,7 +21,7 @@ import java.util.Locale;
/**
* Alternative HTML mapping rules that pass the input HTML as-is without any
* modifications.
- *
+ *
* @since Apache Tika 0.8
*/
public class IdentityHtmlMapper implements HtmlMapper {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
Fri May 29 14:36:21 2015
@@ -16,9 +16,8 @@
*/
package org.apache.tika.parser.html;
-import java.util.Locale;
-
import javax.xml.XMLConstants;
+import java.util.Locale;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.Attributes;
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
Fri May 29 14:36:21 2015
@@ -38,25 +38,23 @@ import org.xml.sax.SAXException;
/**
* Parser for the Better Portable Graphics (BPG) File Format.
- *
+ * <p/>
* Documentation on the file format is available from
* http://bellard.org/bpg/bpg_spec.txt
*/
public class BPGParser extends AbstractParser {
+ protected static final int EXTENSION_TAG_EXIF = 1;
+ protected static final int EXTENSION_TAG_ICC_PROFILE = 2;
+ protected static final int EXTENSION_TAG_XMP = 3;
+ protected static final int EXTENSION_TAG_THUMBNAIL = 4;
private static final long serialVersionUID = -161736541253892772L;
-
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.image("x-bpg"), MediaType.image("bpg"))));
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.image("x-bpg"), MediaType.image("bpg"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
-
- protected static final int EXTENSION_TAG_EXIF = 1;
- protected static final int EXTENSION_TAG_ICC_PROFILE = 2;
- protected static final int EXTENSION_TAG_XMP = 3;
- protected static final int EXTENSION_TAG_THUMBNAIL = 4;
public void parse(
InputStream stream, ContentHandler handler,
@@ -65,16 +63,16 @@ public class BPGParser extends AbstractP
// Check for the magic header signature
byte[] signature = new byte[4];
IOUtils.readFully(stream, signature);
- if(signature[0] == (byte)'B' && signature[1] == (byte)'P' &&
- signature[2] == (byte)'G' && signature[3] == (byte)0xfb) {
- // Good, signature found
+ if (signature[0] == (byte) 'B' && signature[1] == (byte) 'P' &&
+ signature[2] == (byte) 'G' && signature[3] == (byte) 0xfb) {
+ // Good, signature found
} else {
- throw new TikaException("BPG magic signature invalid");
+ throw new TikaException("BPG magic signature invalid");
}
-
+
// Grab and decode the first byte
int pdf = stream.read();
-
+
// Pixel format: Greyscale / 4:2:0 / 4:2:2 / 4:4:4
int pixelFormat = pdf & 0x7;
// TODO Identify a suitable metadata key for this
@@ -82,14 +80,14 @@ public class BPGParser extends AbstractP
// Is there an alpha plane as well as a colour plane?
boolean hasAlphaPlane1 = (pdf & 0x8) == 0x8;
// TODO Identify a suitable metadata key for this+hasAlphaPlane2
-
+
// Bit depth minus 8
int bitDepth = (pdf >> 4) + 8;
metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(bitDepth));
-
+
// Grab and decode the second byte
int cer = stream.read();
-
+
// Colour Space: YCbCr / RGB / YCgCo / YCbCrK / CMYK
int colourSpace = cer & 0x15;
switch (colourSpace) {
@@ -109,7 +107,7 @@ public class BPGParser extends AbstractP
metadata.set(Photoshop.COLOR_MODE, "CMYK Colour");
break;
}
-
+
// Are there extensions or not?
boolean hasExtensions = (cer & 16) == 16;
@@ -117,35 +115,35 @@ public class BPGParser extends AbstractP
boolean hasAlphaPlane2 = (cer & 32) == 32;
// cer then holds 2 more booleans - limited range, reserved
-
+
// Width and height next
- int width = (int)EndianUtils.readUE7(stream);
- int height = (int)EndianUtils.readUE7(stream);
+ int width = (int) EndianUtils.readUE7(stream);
+ int height = (int) EndianUtils.readUE7(stream);
metadata.set(TIFF.IMAGE_LENGTH, height);
metadata.set(TIFF.IMAGE_WIDTH, width);
-
+
// Picture Data length
EndianUtils.readUE7(stream);
-
+
// Extension Data Length, if extensions present
long extensionDataLength = 0;
if (hasExtensions)
extensionDataLength = EndianUtils.readUE7(stream);
-
+
// Alpha Data Length, if alpha used
long alphaDataLength = 0;
if (hasAlphaPlane1 || hasAlphaPlane2)
alphaDataLength = EndianUtils.readUE7(stream);
-
+
// Extension Data
if (hasExtensions) {
long extensionsDataSeen = 0;
- ImageMetadataExtractor metadataExtractor =
+ ImageMetadataExtractor metadataExtractor =
new ImageMetadataExtractor(metadata);
-
+
while (extensionsDataSeen < extensionDataLength) {
- int extensionType = (int)EndianUtils.readUE7(stream);
- int extensionLength = (int)EndianUtils.readUE7(stream);
+ int extensionType = (int) EndianUtils.readUE7(stream);
+ int extensionLength = (int) EndianUtils.readUE7(stream);
switch (extensionType) {
case EXTENSION_TAG_EXIF:
metadataExtractor.parseRawExif(stream,
extensionLength, true);
@@ -159,19 +157,19 @@ public class BPGParser extends AbstractP
extensionsDataSeen += extensionLength;
}
}
-
+
// HEVC Header + Data
// Alpha HEVC Header + Data
// We can't do anything with these parts
-
+
// We don't have any helpful text, sorry...
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.endDocument();
}
-
- protected void handleXMP(InputStream stream, int xmpLength,
- ImageMetadataExtractor extractor) throws IOException,
TikaException, SAXException {
+
+ protected void handleXMP(InputStream stream, int xmpLength,
+ ImageMetadataExtractor extractor) throws
IOException, TikaException, SAXException {
byte[] xmp = new byte[xmpLength];
IOUtils.readFully(stream, xmp);
extractor.parseRawXMP(xmp);
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
Fri May 29 14:36:21 2015
@@ -66,9 +66,9 @@ import org.xml.sax.SAXException;
*/
public class ImageMetadataExtractor {
+ private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6
dp seems to be reasonable
private final Metadata metadata;
private DirectoryHandler[] handlers;
- private static final String GEO_DECIMAL_FORMAT_STRING = "#.######"; // 6
dp seems to be reasonable
/**
* @param metadata to extract to, using default directory handlers
@@ -93,6 +93,15 @@ public class ImageMetadataExtractor {
this.handlers = handlers;
}
+ private static String trimPixels(String s) {
+ //if height/width appears as "100 pixels", trim " pixels"
+ if (s != null) {
+ int i = s.lastIndexOf(" pixels");
+ s = s.substring(0, i);
+ }
+ return s;
+ }
+
public void parseJpeg(File file)
throws IOException, SAXException, TikaException {
try {
@@ -203,15 +212,6 @@ public class ImageMetadataExtractor {
}
}
- private static String trimPixels(String s) {
- //if height/width appears as "100 pixels", trim " pixels"
- if (s != null) {
- int i = s.lastIndexOf(" pixels");
- s = s.substring(0, i);
- }
- return s;
- }
-
/**
* Reads one or more type of Metadata Extractor fields.
*/
@@ -451,8 +451,8 @@ public class ImageMetadataExtractor {
metadata.set(Metadata.RESOLUTION_UNIT,
directory.getDescription(ExifIFD0Directory.TAG_RESOLUTION_UNIT));
}
if (directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_WIDTH))
{
- metadata.set(Metadata.IMAGE_WIDTH,
-
trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)));
+ metadata.set(Metadata.IMAGE_WIDTH,
+
trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_WIDTH)));
}
if
(directory.containsTag(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)) {
metadata.set(Metadata.IMAGE_LENGTH,
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
Fri May 29 14:36:21 2015
@@ -16,6 +16,11 @@
*/
package org.apache.tika.parser.image;
+import javax.imageio.IIOException;
+import javax.imageio.ImageIO;
+import javax.imageio.ImageReader;
+import javax.imageio.metadata.IIOMetadata;
+import javax.imageio.stream.ImageInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
@@ -24,12 +29,6 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
-import javax.imageio.IIOException;
-import javax.imageio.ImageIO;
-import javax.imageio.ImageReader;
-import javax.imageio.metadata.IIOMetadata;
-import javax.imageio.stream.ImageInputStream;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
@@ -46,98 +45,38 @@ import org.xml.sax.SAXException;
public class ImageParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 7852529269245520335L;
private static final MediaType CANONICAL_BMP_TYPE =
MediaType.image("x-ms-bmp");
private static final MediaType JAVA_BMP_TYPE = MediaType.image("bmp");
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- CANONICAL_BMP_TYPE,
- JAVA_BMP_TYPE,
- MediaType.image("gif"),
- MediaType.image("png"),
- MediaType.image("vnd.wap.wbmp"),
- MediaType.image("x-icon"),
- MediaType.image("x-xcf"))));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- String type = metadata.get(Metadata.CONTENT_TYPE);
- if (type != null) {
- // Java has a different idea of the BMP mime type to
- // what the canonical one is, fix this up.
- if (CANONICAL_BMP_TYPE.toString().equals(type)) {
- type = JAVA_BMP_TYPE.toString();
- }
-
- try {
- Iterator<ImageReader> iterator =
- ImageIO.getImageReadersByMIMEType(type);
- if (iterator.hasNext()) {
- ImageReader reader = iterator.next();
- try {
- ImageInputStream imageStream =
ImageIO.createImageInputStream(
- new CloseShieldInputStream(stream));
- try {
- reader.setInput(imageStream);
-
- metadata.set(Metadata.IMAGE_WIDTH,
Integer.toString(reader.getWidth(0)));
- metadata.set(Metadata.IMAGE_LENGTH,
Integer.toString(reader.getHeight(0)));
- metadata.set("height",
Integer.toString(reader.getHeight(0)));
- metadata.set("width",
Integer.toString(reader.getWidth(0)));
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ CANONICAL_BMP_TYPE,
+ JAVA_BMP_TYPE,
+ MediaType.image("gif"),
+ MediaType.image("png"),
+ MediaType.image("vnd.wap.wbmp"),
+ MediaType.image("x-icon"),
+ MediaType.image("x-xcf"))));
- loadMetadata(reader.getImageMetadata(0), metadata);
- } finally {
- imageStream.close();
- }
- } finally {
- reader.dispose();
- }
- }
-
- // Translate certain Metadata tags from the ImageIO
- // specific namespace into the general Tika one
- setIfPresent(metadata, "CommentExtensions CommentExtension",
TikaCoreProperties.COMMENTS);
- setIfPresent(metadata, "markerSequence com",
TikaCoreProperties.COMMENTS);
- setIfPresent(metadata, "Data BitsPerSample",
Metadata.BITS_PER_SAMPLE);
- } catch (IIOException e) {
- // TIKA-619: There is a known bug in the Sun API when dealing
with GIF images
- // which Tika will just ignore.
- if (!(e.getMessage() != null &&
- e.getMessage().equals("Unexpected block type 0!") &&
- type.equals("image/gif"))) {
- throw new TikaException(type + " parse error", e);
- }
- }
+ private static void setIfPresent(Metadata metadata, String imageIOkey,
String tikaKey) {
+ if (metadata.get(imageIOkey) != null) {
+ metadata.set(tikaKey, metadata.get(imageIOkey));
}
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.endDocument();
}
-
- private static void setIfPresent(Metadata metadata, String imageIOkey,
String tikaKey) {
- if(metadata.get(imageIOkey) != null) {
- metadata.set(tikaKey, metadata.get(imageIOkey));
- }
- }
private static void setIfPresent(Metadata metadata, String imageIOkey,
Property tikaProp) {
- if(metadata.get(imageIOkey) != null) {
- String v = metadata.get(imageIOkey);
- if(v.endsWith(" ")) {
- v = v.substring(0, v.lastIndexOf(' '));
- }
- metadata.set(tikaProp, v);
- }
+ if (metadata.get(imageIOkey) != null) {
+ String v = metadata.get(imageIOkey);
+ if (v.endsWith(" ")) {
+ v = v.substring(0, v.lastIndexOf(' '));
+ }
+ metadata.set(tikaProp, v);
+ }
}
private static void loadMetadata(IIOMetadata imageMetadata, Metadata
metadata) {
@@ -202,4 +141,66 @@ public class ImageParser extends Abstrac
return value;
}
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ String type = metadata.get(Metadata.CONTENT_TYPE);
+ if (type != null) {
+ // Java has a different idea of the BMP mime type to
+ // what the canonical one is, fix this up.
+ if (CANONICAL_BMP_TYPE.toString().equals(type)) {
+ type = JAVA_BMP_TYPE.toString();
+ }
+
+ try {
+ Iterator<ImageReader> iterator =
+ ImageIO.getImageReadersByMIMEType(type);
+ if (iterator.hasNext()) {
+ ImageReader reader = iterator.next();
+ try {
+ ImageInputStream imageStream =
ImageIO.createImageInputStream(
+ new CloseShieldInputStream(stream));
+ try {
+ reader.setInput(imageStream);
+
+ metadata.set(Metadata.IMAGE_WIDTH,
Integer.toString(reader.getWidth(0)));
+ metadata.set(Metadata.IMAGE_LENGTH,
Integer.toString(reader.getHeight(0)));
+ metadata.set("height",
Integer.toString(reader.getHeight(0)));
+ metadata.set("width",
Integer.toString(reader.getWidth(0)));
+
+ loadMetadata(reader.getImageMetadata(0), metadata);
+ } finally {
+ imageStream.close();
+ }
+ } finally {
+ reader.dispose();
+ }
+ }
+
+ // Translate certain Metadata tags from the ImageIO
+ // specific namespace into the general Tika one
+ setIfPresent(metadata, "CommentExtensions CommentExtension",
TikaCoreProperties.COMMENTS);
+ setIfPresent(metadata, "markerSequence com",
TikaCoreProperties.COMMENTS);
+ setIfPresent(metadata, "Data BitsPerSample",
Metadata.BITS_PER_SAMPLE);
+ } catch (IIOException e) {
+ // TIKA-619: There is a known bug in the Sun API when dealing
with GIF images
+ // which Tika will just ignore.
+ if (!(e.getMessage() != null &&
+ e.getMessage().equals("Unexpected block type 0!") &&
+ type.equals("image/gif"))) {
+ throw new TikaException(type + " parse error", e);
+ }
+ }
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java
Fri May 29 14:36:21 2015
@@ -30,9 +30,15 @@ import org.apache.tika.metadata.TikaCore
* ImageMetadataExtractor, but it can be generalized.
*/
public abstract class MetadataFields {
-
+
private static HashSet<String> known;
-
+
+ static {
+ known = new HashSet<String>();
+ setKnownForClass(TikaCoreProperties.class);
+ setKnownForClass(Metadata.class);
+ }
+
private static void setKnownForClass(Class<?> clazz) {
Field[] fields = clazz.getFields();
for (Field f : fields) {
@@ -66,19 +72,13 @@ public abstract class MetadataFields {
}
}
}
-
- static {
- known = new HashSet<String>();
- setKnownForClass(TikaCoreProperties.class);
- setKnownForClass(Metadata.class);
- }
-
+
public static boolean isMetadataField(String name) {
return known.contains(name);
}
-
+
public static boolean isMetadataField(Property property) {
return known.contains(property.getName());
}
-
+
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
Fri May 29 14:36:21 2015
@@ -40,18 +40,20 @@ import org.xml.sax.SAXException;
/**
* Parser for the Adobe Photoshop PSD File Format.
- *
+ * <p/>
* Documentation on the file format is available from
*
http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
*/
public class PSDParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = 883387734607994914L;
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.image("vnd.adobe.photoshop"))));
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.image("vnd.adobe.photoshop"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
@@ -64,24 +66,24 @@ public class PSDParser extends AbstractP
// Check for the magic header signature
byte[] signature = new byte[4];
IOUtils.readFully(stream, signature);
- if(signature[0] == (byte)'8' && signature[1] == (byte)'B' &&
- signature[2] == (byte)'P' && signature[3] == (byte)'S') {
- // Good, signature found
+ if (signature[0] == (byte) '8' && signature[1] == (byte) 'B' &&
+ signature[2] == (byte) 'P' && signature[3] == (byte) 'S') {
+ // Good, signature found
} else {
- throw new TikaException("PSD/PSB magic signature invalid");
+ throw new TikaException("PSD/PSB magic signature invalid");
}
-
+
// Check the version
int version = EndianUtils.readUShortBE(stream);
- if(version == 1 || version == 2) {
- // Good, we support these two
+ if (version == 1 || version == 2) {
+ // Good, we support these two
} else {
- throw new TikaException("Invalid PSD/PSB version " + version);
+ throw new TikaException("Invalid PSD/PSB version " + version);
}
-
+
// Skip the reserved block
IOUtils.readFully(stream, new byte[6]);
-
+
// Number of channels in the image
int numChannels = EndianUtils.readUShortBE(stream);
// TODO Identify a suitable metadata key for this
@@ -91,15 +93,15 @@ public class PSDParser extends AbstractP
int width = EndianUtils.readIntBE(stream);
metadata.set(TIFF.IMAGE_LENGTH, height);
metadata.set(TIFF.IMAGE_WIDTH, width);
-
+
// Depth (bits per channel)
int depth = EndianUtils.readUShortBE(stream);
metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));
-
+
// Colour mode, eg Bitmap or RGB
int colorMode = EndianUtils.readUShortBE(stream);
metadata.set(Photoshop.COLOR_MODE,
Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
-
+
// Next is the Color Mode section
// We don't care about this bit
long colorModeSectionSize = EndianUtils.readIntBE(stream);
@@ -109,92 +111,93 @@ public class PSDParser extends AbstractP
// Check for certain interesting keys here
long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
long read = 0;
- while(read < imageResourcesSectionSize) {
- ResourceBlock rb = new ResourceBlock(stream);
- read += rb.totalLength;
-
- // Is it one we can do something useful with?
- if(rb.id == ResourceBlock.ID_CAPTION) {
- metadata.add(TikaCoreProperties.DESCRIPTION,
rb.getDataAsString());
- } else if(rb.id == ResourceBlock.ID_EXIF_1) {
- // TODO Parse the EXIF info via ImageMetadataExtractor
- } else if(rb.id == ResourceBlock.ID_EXIF_3) {
- // TODO Parse the EXIF info via ImageMetadataExtractor
- } else if(rb.id == ResourceBlock.ID_XMP) {
- // TODO Parse the XMP info via ImageMetadataExtractor
- }
+ while (read < imageResourcesSectionSize) {
+ ResourceBlock rb = new ResourceBlock(stream);
+ read += rb.totalLength;
+
+ // Is it one we can do something useful with?
+ if (rb.id == ResourceBlock.ID_CAPTION) {
+ metadata.add(TikaCoreProperties.DESCRIPTION,
rb.getDataAsString());
+ } else if (rb.id == ResourceBlock.ID_EXIF_1) {
+ // TODO Parse the EXIF info via ImageMetadataExtractor
+ } else if (rb.id == ResourceBlock.ID_EXIF_3) {
+ // TODO Parse the EXIF info via ImageMetadataExtractor
+ } else if (rb.id == ResourceBlock.ID_XMP) {
+ // TODO Parse the XMP info via ImageMetadataExtractor
+ }
}
-
+
// Next is the Layer and Mask Info
// Finally we have Image Data
// We can't do anything with these parts
-
+
// We don't have any helpful text, sorry...
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.endDocument();
}
-
+
private static class ResourceBlock {
- private static final long SIGNATURE = 0x3842494d; // 8BIM
- private static final int ID_CAPTION = 0x03F0;
- private static final int ID_URL = 0x040B;
- private static final int ID_EXIF_1 = 0x0422;
- private static final int ID_EXIF_3 = 0x0423;
- private static final int ID_XMP = 0x0424;
-
- private int id;
- private String name;
- private byte[] data;
- private int totalLength;
- private ResourceBlock(InputStream stream) throws IOException,
TikaException {
- // Verify the signature
- long sig = EndianUtils.readIntBE(stream);
- if(sig != SIGNATURE) {
- throw new TikaException("Invalid Image Resource Block Signature
Found, got " +
- sig + " 0x" + Long.toHexString(sig) + " but the spec
defines " + SIGNATURE);
- }
-
- // Read the block
- id = EndianUtils.readUShortBE(stream);
-
- StringBuffer nameB = new StringBuffer();
- int nameLen = 0;
- while(true) {
- int v = stream.read();
- nameLen++;
-
- if(v == 0) {
- // The name length is padded to be even
- if(nameLen % 2 == 1) {
- stream.read();
- nameLen++;
+ private static final long SIGNATURE = 0x3842494d; // 8BIM
+ private static final int ID_CAPTION = 0x03F0;
+ private static final int ID_URL = 0x040B;
+ private static final int ID_EXIF_1 = 0x0422;
+ private static final int ID_EXIF_3 = 0x0423;
+ private static final int ID_XMP = 0x0424;
+
+ private int id;
+ private String name;
+ private byte[] data;
+ private int totalLength;
+
+ private ResourceBlock(InputStream stream) throws IOException,
TikaException {
+ // Verify the signature
+ long sig = EndianUtils.readIntBE(stream);
+ if (sig != SIGNATURE) {
+ throw new TikaException("Invalid Image Resource Block
Signature Found, got " +
+ sig + " 0x" + Long.toHexString(sig) + " but the spec
defines " + SIGNATURE);
+ }
+
+ // Read the block
+ id = EndianUtils.readUShortBE(stream);
+
+ StringBuffer nameB = new StringBuffer();
+ int nameLen = 0;
+ while (true) {
+ int v = stream.read();
+ nameLen++;
+
+ if (v == 0) {
+ // The name length is padded to be even
+ if (nameLen % 2 == 1) {
+ stream.read();
+ nameLen++;
+ }
+ break;
+ } else {
+ nameB.append((char) v);
}
- break;
- } else {
- nameB.append((char)v);
- }
- name = nameB.toString();
- }
-
- int dataLen = EndianUtils.readIntBE(stream);
- if(dataLen %2 == 1) {
- // Data Length is even padded
- dataLen = dataLen + 1;
- }
- totalLength = 4 + 2 + nameLen + 4 + dataLen;
-
- data = new byte[dataLen];
- IOUtils.readFully(stream, data);
- }
-
- private String getDataAsString() {
- // Will be null padded
- try {
- return new String(data, 0, data.length-1, "ASCII");
- } catch(UnsupportedEncodingException e) {
- throw new RuntimeException("Something is very broken in your
JVM!");
- }
- }
+ name = nameB.toString();
+ }
+
+ int dataLen = EndianUtils.readIntBE(stream);
+ if (dataLen % 2 == 1) {
+ // Data Length is even padded
+ dataLen = dataLen + 1;
+ }
+ totalLength = 4 + 2 + nameLen + 4 + dataLen;
+
+ data = new byte[dataLen];
+ IOUtils.readFully(stream, data);
+ }
+
+ private String getDataAsString() {
+ // Will be null padded
+ try {
+ return new String(data, 0, data.length - 1, "ASCII");
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException("Something is very broken in your
JVM!");
+ }
+ }
}
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
Fri May 29 14:36:21 2015
@@ -35,11 +35,13 @@ import org.xml.sax.SAXException;
public class TiffParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -3941143576535464926L;
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.image("tiff"));
+ Collections.singleton(MediaType.image("tiff"));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/WebPParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/WebPParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/WebPParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/WebPParser.java
Fri May 29 14:36:21 2015
@@ -35,7 +35,9 @@ import org.xml.sax.SAXException;
public class WebPParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -3941143576535464926L;
private static final Set<MediaType> SUPPORTED_TYPES =
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
Fri May 29 14:36:21 2015
@@ -34,12 +34,10 @@ import org.xml.sax.InputSource;
public class JempboxExtractor {
- private XMPPacketScanner scanner = new XMPPacketScanner();
-
- private Metadata metadata;
-
// The XMP spec says it must be unicode, but for most file formats it
specifies "must be encoded in UTF-8"
private static final String DEFAULT_XMP_CHARSET = IOUtils.UTF_8.name();
+ private XMPPacketScanner scanner = new XMPPacketScanner();
+ private Metadata metadata;
public JempboxExtractor(Metadata metadata) {
this.metadata = metadata;
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
Fri May 29 14:36:21 2015
@@ -27,10 +27,10 @@ import java.io.UnsupportedEncodingExcept
/**
* This class is a parser for XMP packets. By default, it tries to locate the
first XMP packet
* it finds and parses it.
- * <p>
+ * <p/>
* Important: Before you use this class to look for an XMP packet in some
random file, please read
* the chapter on "Scanning Files for XMP Packets" in the XMP specification!
- * <p>
+ * <p/>
* This class was branched from http://xmlgraphics.apache.org/ XMPPacketParser.
* See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant.
*/
@@ -50,19 +50,47 @@ public class XMPPacketScanner {
}
}
+ private static boolean skipAfter(InputStream in, byte[] match) throws
IOException {
+ return skipAfter(in, match, null);
+ }
+
+ private static boolean skipAfter(InputStream in, byte[] match,
OutputStream out)
+ throws IOException {
+ int found = 0;
+ int len = match.length;
+ int b;
+ while ((b = in.read()) >= 0) {
+ if (b == match[found]) {
+ found++;
+ if (found == len) {
+ return true;
+ }
+ } else {
+ if (out != null) {
+ if (found > 0) {
+ out.write(match, 0, found);
+ }
+ out.write(b);
+ }
+ found = 0;
+ }
+ }
+ return false;
+ }
+
/**
* Locates an XMP packet in a stream, parses it and returns the XMP
metadata. If no
* XMP packet is found until the stream ends, null is returned. Note: This
method
* only finds the first XMP packet in a stream. And it cannot determine
whether it
* has found the right XMP packet if there are multiple packets.
- *
+ * <p/>
* Does <em>not</em> close the stream.
* If XMP block was found reading can continue below the block.
- *
- * @param in the InputStream to search
+ *
+ * @param in the InputStream to search
* @param xmlOut to write the XMP packet to
* @return true if XMP packet is found, false otherwise
- * @throws IOException if an I/O error occurs
+ * @throws IOException if an I/O error occurs
* @throws TransformerException if an error occurs while parsing the XMP
packet
*/
public boolean parse(InputStream in, OutputStream xmlOut) throws
IOException {
@@ -84,33 +112,5 @@ public class XMPPacketScanner {
return true;
}
- private static boolean skipAfter(InputStream in, byte[] match) throws
IOException {
- return skipAfter(in, match, null);
- }
-
- private static boolean skipAfter(InputStream in, byte[] match,
OutputStream out)
- throws IOException {
- int found = 0;
- int len = match.length;
- int b;
- while ((b = in.read()) >= 0) {
- if (b == match[found]) {
- found++;
- if (found == len) {
- return true;
- }
- } else {
- if (out != null) {
- if (found > 0) {
- out.write(match, 0, found);
- }
- out.write(b);
- }
- found = 0;
- }
- }
- return false;
- }
-
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/AbstractDBParser.java
Fri May 29 14:36:21 2015
@@ -46,6 +46,11 @@ abstract class AbstractDBParser extends
private Connection connection;
+ protected static EmbeddedDocumentExtractor
getEmbeddedDocumentExtractor(ParseContext context) {
+ return context.get(EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+ }
+
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return null;
@@ -90,7 +95,7 @@ abstract class AbstractDBParser extends
}
} finally {
if (xHandler != null) {
- xHandler.endDocument();
+ xHandler.endDocument();
}
try {
close();
@@ -100,11 +105,6 @@ abstract class AbstractDBParser extends
}
}
- protected static EmbeddedDocumentExtractor
getEmbeddedDocumentExtractor(ParseContext context) {
- return context.get(EmbeddedDocumentExtractor.class,
- new ParsingEmbeddedDocumentExtractor(context));
- }
-
/**
* Override this for any special handling of closing the connection.
*
@@ -119,9 +119,9 @@ abstract class AbstractDBParser extends
* Override this for special configuration of the connection, such as
limiting
* the number of rows to be held in memory.
*
- * @param stream stream to use
+ * @param stream stream to use
* @param metadata metadata that could be used in parameterizing the
connection
- * @param context parsecontext that could be used in parameterizing the
connection
+ * @param context parsecontext that could be used in parameterizing the
connection
* @return connection
* @throws java.io.IOException
* @throws org.apache.tika.exception.TikaException
@@ -135,7 +135,7 @@ abstract class AbstractDBParser extends
} catch (ClassNotFoundException e) {
throw new TikaException(e.getMessage());
}
- try{
+ try {
connection = DriverManager.getConnection(connectionString);
} catch (SQLException e) {
throw new IOExceptionWithCause(e);
@@ -145,31 +145,32 @@ abstract class AbstractDBParser extends
/**
* Implement for db specific connection information, e.g.
"jdbc:sqlite:/docs/mydb.db"
- * <p>
+ * <p/>
* Include any optimization settings, user name, password, etc.
- * <p>
- * @param stream stream for processing
- * @param metadata metadata might be useful in determining connection info
+ * <p/>
+ *
+ * @param stream stream for processing
+ * @param metadata metadata might be useful in determining connection
info
* @param parseContext context to use to help create connectionString
* @return connection string to be used by {@link #getConnection}.
* @throws java.io.IOException
- */
+ */
abstract protected String getConnectionString(InputStream stream,
- Metadata metadata, ParseContext
parseContext) throws IOException;
+ Metadata metadata,
ParseContext parseContext) throws IOException;
/**
* JDBC class name, e.g. org.sqlite.JDBC
+ *
* @return jdbc class name
*/
abstract protected String getJDBCClassName();
/**
- *
* Returns the names of the tables to process
*
* @param connection Connection to use to make the sql call(s) to get the
names of the tables
- * @param metadata Metadata to use (potentially) in decision about which
tables to extract
- * @param context ParseContext to use (potentially) in decision about
which tables to extract
+ * @param metadata Metadata to use (potentially) in decision about which
tables to extract
+ * @param context ParseContext to use (potentially) in decision about
which tables to extract
* @return
* @throws java.sql.SQLException
*/
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3DBParser.java
Fri May 29 14:36:21 2015
@@ -36,7 +36,7 @@ import org.sqlite.SQLiteConfig;
/**
* This is the implementation of the db parser for SQLite.
- * <p>
+ * <p/>
* This parser is internal only; it should not be registered in the services
* file or configured in the TikaConfig xml file.
*/
@@ -45,7 +45,6 @@ class SQLite3DBParser extends AbstractDB
protected static final String SQLITE_CLASS_NAME = "org.sqlite.JDBC";
/**
- *
* @param context context
* @return null (always)
*/
@@ -64,7 +63,7 @@ class SQLite3DBParser extends AbstractDB
} catch (ClassNotFoundException e) {
throw new IOExceptionWithCause(e);
}
- try{
+ try {
SQLiteConfig config = new SQLiteConfig();
//good habit, but effectively meaningless here
@@ -80,7 +79,7 @@ class SQLite3DBParser extends AbstractDB
@Override
protected String getConnectionString(InputStream is, Metadata metadata,
ParseContext context) throws IOException {
File dbFile = TikaInputStream.get(is).getFile();
- return "jdbc:sqlite:"+dbFile.getAbsolutePath();
+ return "jdbc:sqlite:" + dbFile.getAbsolutePath();
}
@Override
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3Parser.java
Fri May 29 14:36:21 2015
@@ -15,6 +15,7 @@ package org.apache.tika.parser.jdbc;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
@@ -31,18 +32,19 @@ import org.xml.sax.SAXException;
/**
* This is the main class for parsing SQLite3 files. When {@link #parse} is
called,
* this creates a new {@link org.apache.tika.parser.jdbc.SQLite3DBParser}.
- * <p>
+ * <p/>
* Given potential conflicts of native libraries in web servers, users will
* need to add org.xerial's sqlite-jdbc jar to the class path for this parser
* to work. For development and testing, this jar is specified in
tika-parsers'
* pom.xml, but it is currently set to "provided."
- * <p>
+ * <p/>
* Note that this family of jdbc parsers is designed to treat each CLOB and
each BLOB
* as embedded documents.
- *
*/
public class SQLite3Parser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -752276948656079347L;
private static final MediaType MEDIA_TYPE =
MediaType.application("x-sqlite3");
@@ -51,7 +53,7 @@ public class SQLite3Parser extends Abstr
/**
* Checks to see if class is available for org.sqlite.JDBC.
- * <p>
+ * <p/>
* If not, this class will return an EMPTY_SET for getSupportedTypes()
*/
public SQLite3Parser() {
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jdbc/SQLite3TableReader.java
Fri May 29 14:36:21 2015
@@ -33,14 +33,13 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-
/**
* Concrete class for SQLLite table parsing. This overrides
* column type handling from JDBCRowHandler.
- * <p>
+ * <p/>
* This class is not designed to be thread safe (because of DateFormat)!
* Need to call a new instance for each parse, as AbstractDBParser does.
- * <p>
+ * <p/>
* For now, this silently skips cells of type CLOB, because xerial's jdbc
connector
* does not currently support them.
*/
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=1682489&r1=1682488&r2=1682489&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
Fri May 29 14:36:21 2015
@@ -36,11 +36,13 @@ import org.xml.sax.SAXException;
public class JpegParser extends AbstractParser {
- /** Serial version UID */
+ /**
+ * Serial version UID
+ */
private static final long serialVersionUID = -1355028253756234603L;
private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.image("jpeg"));
+ Collections.singleton(MediaType.image("jpeg"));
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;