Author: jukka
Date: Tue Jan 26 16:23:46 2010
New Revision: 903305
URL: http://svn.apache.org/viewvc?rev=903305&view=rev
Log:
TIKA-362: Add publisher support
Patch by Nick Burch
Added:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub
(with props)
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testVISIO.vsd
(with props)
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=903305&r1=903304&r2=903305&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Tue Jan 26 16:23:46 2010
@@ -24,6 +24,7 @@
import java.util.Locale;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.MarkUnsupportedException;
@@ -33,6 +34,7 @@
import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
@@ -80,52 +82,59 @@
while (entries.hasNext()) {
Entry entry = (Entry) entries.next();
String name = entry.getName();
- if (!(entry instanceof DocumentEntry)) {
- // Skip directory entries
- } else if ("WordDocument".equals(name)) {
- setType(metadata, "application/msword");
- WordExtractor extractor = new WordExtractor(filesystem);
-
- addTextIfAny(xhtml, "header", extractor.getHeaderText());
-
- for (String paragraph : extractor.getParagraphText()) {
- xhtml.element("p", paragraph);
- }
-
- for (String paragraph : extractor.getFootnoteText()) {
- xhtml.element("p", paragraph);
- }
-
- for (String paragraph : extractor.getCommentsText()) {
- xhtml.element("p", paragraph);
- }
-
- for (String paragraph : extractor.getEndnoteText()) {
- xhtml.element("p", paragraph);
- }
-
- addTextIfAny(xhtml, "footer", extractor.getFooterText());
- } else if ("PowerPoint Document".equals(name)) {
- setType(metadata, "application/vnd.ms-powerpoint");
- PowerPointExtractor extractor =
- new PowerPointExtractor(filesystem);
- xhtml.element("p", extractor.getText(true, true));
- } else if ("Workbook".equals(name)) {
- setType(metadata, "application/vnd.ms-excel");
- Locale locale = context.get(Locale.class, Locale.getDefault());
- new ExcelExtractor().parse(filesystem, xhtml, locale);
- } else if ("VisioDocument".equals(name)) {
- setType(metadata, "application/vnd.visio");
- VisioTextExtractor extractor =
- new VisioTextExtractor(filesystem);
- for (String text : extractor.getAllText()) {
- xhtml.element("p", text);
- }
- } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
- // TODO: Cleaner mechanism for detecting Outlook
- outlookExtracted = true;
- setType(metadata, "application/vnd.ms-outlook");
- new OutlookExtractor(filesystem).parse(xhtml, metadata);
+ if (entry instanceof DirectoryEntry) {
+ if ("Quill".equals(name)) {
+ setType(metadata, "application/x-mspublisher");
+ PublisherTextExtractor extractor =
+ new PublisherTextExtractor(filesystem);
+ xhtml.element("p", extractor.getText());
+ }
+ } else if (entry instanceof DocumentEntry) {
+ if ("WordDocument".equals(name)) {
+ setType(metadata, "application/msword");
+ WordExtractor extractor = new WordExtractor(filesystem);
+
+ addTextIfAny(xhtml, "header", extractor.getHeaderText());
+
+ for (String paragraph : extractor.getParagraphText()) {
+ xhtml.element("p", paragraph);
+ }
+
+ for (String paragraph : extractor.getFootnoteText()) {
+ xhtml.element("p", paragraph);
+ }
+
+ for (String paragraph : extractor.getCommentsText()) {
+ xhtml.element("p", paragraph);
+ }
+
+ for (String paragraph : extractor.getEndnoteText()) {
+ xhtml.element("p", paragraph);
+ }
+
+ addTextIfAny(xhtml, "footer", extractor.getFooterText());
+ } else if ("PowerPoint Document".equals(name)) {
+ setType(metadata, "application/vnd.ms-powerpoint");
+ PowerPointExtractor extractor =
+ new PowerPointExtractor(filesystem);
+ xhtml.element("p", extractor.getText(true, true));
+ } else if ("Workbook".equals(name)) {
+ setType(metadata, "application/vnd.ms-excel");
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ new ExcelExtractor().parse(filesystem, xhtml, locale);
+ } else if ("VisioDocument".equals(name)) {
+ setType(metadata, "application/vnd.visio");
+ VisioTextExtractor extractor =
+ new VisioTextExtractor(filesystem);
+ for (String text : extractor.getAllText()) {
+ xhtml.element("p", text);
+ }
+ } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
+ // TODO: Cleaner mechanism for detecting Outlook
+ outlookExtracted = true;
+ setType(metadata, "application/vnd.ms-outlook");
+ new OutlookExtractor(filesystem).parse(xhtml, metadata);
+ }
}
}
Added:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java?rev=903305&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java (added)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java Tue Jan 26 16:23:46 2010
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class PublisherParserTest extends TestCase {
+
+ public void testPublisherParser() throws Exception {
+ InputStream input = PublisherParserTest.class.getResourceAsStream(
+ "/test-documents/testPUBLISHER.pub");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new OfficeParser().parse(input, handler, metadata);
+
+ assertEquals(
+ "application/x-mspublisher",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(null, metadata.get(Metadata.TITLE));
+ assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
+ String content = handler.toString();
+ assertTrue(content.contains("0123456789"));
+ assertTrue(content.contains("abcdef"));
+ } finally {
+ input.close();
+ }
+ }
+
+}
Added:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java?rev=903305&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java (added)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java Tue Jan 26 16:23:46 2010
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class VisioParserTest extends TestCase {
+
+ public void testVisioParser() throws Exception {
+ InputStream input = VisioParserTest.class.getResourceAsStream(
+ "/test-documents/testVISIO.vsd");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new OfficeParser().parse(input, handler, metadata);
+
+ assertEquals(
+ "application/vnd.visio",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("", metadata.get(Metadata.TITLE));
+ assertEquals("Hogwarts", metadata.get(Metadata.AUTHOR));
+ String content = handler.toString();
+ assertTrue(content.contains("Some random text, on a page"));
+ } finally {
+ input.close();
+ }
+ }
+
+}
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub?rev=903305&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testVISIO.vsd
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testVISIO.vsd?rev=903305&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testVISIO.vsd
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream