Author: jukka
Date: Sun Mar 9 03:27:18 2008
New Revision: 635208
URL: http://svn.apache.org/viewvc?rev=635208&view=rev
Log:
TIKA-123: Structured MS Office parsing
- Moved property file parsing to a separate Parser class
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635208&r1=635207&r2=635208&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar 9 03:27:18 2008
@@ -16,14 +16,11 @@
*/
package org.apache.tika.parser.microsoft;
-// JDK imports
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.HPSFException;
-import org.apache.poi.hpsf.PropertySet;
-import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
@@ -31,6 +28,7 @@
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
/**
* Defines a Microsoft document content extractor.
@@ -70,106 +68,17 @@
throws IOException, SAXException, TikaException;
private void getMetadata(
- POIFSFileSystem filesystem, String name, Metadata metadata) {
+ POIFSFileSystem filesystem, String name, Metadata metadata)
+ throws IOException, SAXException, TikaException {
try {
InputStream stream = filesystem.createDocumentInputStream(name);
try {
- getMetadata(stream, metadata);
+ new PropertyParser().parse(stream, new DefaultHandler(), metadata);
} finally {
stream.close();
}
- } catch (Exception e) {
+ } catch (FileNotFoundException e) {
// summary information not available, ignore
- }
- }
-
- private void getMetadata(InputStream stream, Metadata metadata)
- throws HPSFException, IOException {
- PropertySet set = PropertySetFactory.create(stream);
- if (set instanceof SummaryInformation) {
- getMetadata((SummaryInformation) set, metadata);
- } else if (set instanceof DocumentSummaryInformation) {
- getMetadata((DocumentSummaryInformation) set, metadata);
- }
- }
-
- private void getMetadata(
- SummaryInformation information, Metadata metadata) {
- if (information.getTitle() != null) {
- metadata.set(Metadata.TITLE, information.getTitle());
- }
- if (information.getAuthor() != null) {
- metadata.set(Metadata.AUTHOR, information.getAuthor());
- }
- if (information.getKeywords() != null) {
- metadata.set(Metadata.KEYWORDS, information.getKeywords());
- }
- if (information.getSubject() != null) {
- metadata.set(Metadata.SUBJECT, information.getSubject());
- }
- if (information.getLastAuthor() != null) {
- metadata.set(Metadata.LAST_AUTHOR, information.getLastAuthor());
- }
- if (information.getComments() != null) {
- metadata.set(Metadata.COMMENTS, information.getComments());
- }
- if (information.getTemplate() != null) {
- metadata.set(Metadata.TEMPLATE, information.getTemplate());
- }
- if (information.getApplicationName() != null) {
- metadata.set(
- Metadata.APPLICATION_NAME,
- information.getApplicationName());
- }
- if (information.getRevNumber() != null) {
- metadata.set(Metadata.REVISION_NUMBER, information.getRevNumber());
- }
- if (information.getCreateDateTime() != null) {
- metadata.set(
- "creationdate",
- information.getCreateDateTime().toString());
- }
- if (information.getCharCount() > 0) {
- metadata.set(
- Metadata.CHARACTER_COUNT,
- Integer.toString(information.getCharCount()));
- }
- if (information.getEditTime() > 0) {
- metadata.set("edittime", Long.toString(information.getEditTime()));
- }
- if (information.getLastSaveDateTime() != null) {
- metadata.set(
- Metadata.LAST_SAVED,
- information.getLastSaveDateTime().toString());
- }
- if (information.getPageCount() > 0) {
- metadata.set(
- Metadata.PAGE_COUNT,
- Integer.toString(information.getPageCount()));
- }
- if (information.getSecurity() > 0) {
- metadata.set(
- "security", Integer.toString(information.getSecurity()));
- }
- if (information.getWordCount() > 0) {
- metadata.set(
- Metadata.WORD_COUNT,
- Integer.toString(information.getWordCount()));
- }
- if (information.getLastPrinted() != null) {
- metadata.set(
- Metadata.LAST_PRINTED,
- information.getLastPrinted().toString());
- }
- }
-
- private void getMetadata(
- DocumentSummaryInformation information, Metadata metadata) {
- if (information.getCompany() != null) {
- metadata.set("company", information.getCompany());
- }
- if (information.getManager() != null) {
- metadata.set("manager", information.getManager());
}
}
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java?rev=635208&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertyParser.java Sun Mar 9 03:27:18 2008
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for HPSF property streams within Microsoft Office files.
+ */
+public class PropertyParser implements Parser {
+
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ try {
+ PropertySet properties =
+ new PropertySet(IOUtils.toByteArray(stream));
+ if (properties.isSummaryInformation()) {
+ SummaryInformation information = new SummaryInformation(properties);
+ set(metadata, Metadata.TITLE, information.getTitle());
+ set(metadata, Metadata.AUTHOR, information.getAuthor());
+ set(metadata, Metadata.KEYWORDS, information.getKeywords());
+ set(metadata, Metadata.SUBJECT, information.getSubject());
+ set(metadata, Metadata.LAST_AUTHOR, information.getLastAuthor());
+ set(metadata, Metadata.COMMENTS, information.getComments());
+ set(metadata, Metadata.TEMPLATE, information.getTemplate());
+ set(metadata, Metadata.APPLICATION_NAME, information.getApplicationName());
+ set(metadata, Metadata.REVISION_NUMBER, information.getRevNumber());
+ set(metadata, "creationdate", information.getCreateDateTime());
+ set(metadata, Metadata.CHARACTER_COUNT, information.getCharCount());
+ set(metadata, "edittime", information.getEditTime());
+ set(metadata, Metadata.LAST_SAVED, information.getLastSaveDateTime());
+ set(metadata, Metadata.PAGE_COUNT, information.getPageCount());
+ set(metadata, "security", information.getSecurity());
+ set(metadata, Metadata.WORD_COUNT, information.getWordCount());
+ set(metadata, Metadata.LAST_PRINTED, information.getLastPrinted());
+ }
+ if (properties.isDocumentSummaryInformation()) {
+ DocumentSummaryInformation information = new DocumentSummaryInformation(properties);
+ set(metadata, "company", information.getCompany());
+ set(metadata, "manager", information.getManager());
+ }
+
+ // No content, just metadata
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ } catch (NoPropertySetStreamException e) {
+ throw new TikaException("Not a HPSF document", e);
+ } catch (UnexpectedPropertySetTypeException e) {
+ throw new TikaException("Unexpected HPSF document", e);
+ }
+ }
+
+ private static void set(Metadata metadata, String name, String value) {
+ if (value != null) {
+ metadata.set(name, value);
+ }
+ }
+
+ private static void set(Metadata metadata, String name, Date value) {
+ if (value != null) {
+ metadata.set(name, value.toString());
+ }
+ }
+
+ private static void set(Metadata metadata, String name, long value) {
+ if (value > 0) {
+ metadata.set(name, Long.toString(value));
+ }
+ }
+
+}