svn commit: r903305 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ test/java/org/apache/tika/parser/microsoft/ test/resources/test-documents/

jukka Tue, 26 Jan 2010 08:24:10 -0800

Author: jukka
Date: Tue Jan 26 16:23:46 2010
New Revision: 903305

URL: http://svn.apache.org/viewvc?rev=903305&view=rev
Log:
TIKA-362: Add publisher support


Patch by Nick Burch

Added:
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
    
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub
   (with props)
    
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testVISIO.vsd  
 (with props)
Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=903305&r1=903304&r2=903305&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Tue Jan 26 16:23:46 2010
@@ -24,6 +24,7 @@
 import java.util.Locale;
 
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
 import org.apache.poi.hpsf.CustomProperties;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.MarkUnsupportedException;
@@ -33,6 +34,7 @@
 import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
@@ -80,52 +82,59 @@
         while (entries.hasNext()) {
             Entry entry = (Entry) entries.next();
             String name = entry.getName();
-            if (!(entry instanceof DocumentEntry)) {
-                // Skip directory entries
-            } else if ("WordDocument".equals(name)) {
-                setType(metadata, "application/msword");
-                WordExtractor extractor = new WordExtractor(filesystem);
-
-                addTextIfAny(xhtml, "header", extractor.getHeaderText());
-
-                for (String paragraph : extractor.getParagraphText()) {
-                    xhtml.element("p", paragraph);
-                }
-
-                for (String paragraph : extractor.getFootnoteText()) {
-                    xhtml.element("p", paragraph);
-                }
-
-                for (String paragraph : extractor.getCommentsText()) {
-                    xhtml.element("p", paragraph);
-                }
-
-                for (String paragraph : extractor.getEndnoteText()) {
-                    xhtml.element("p", paragraph);
-                }
-
-                addTextIfAny(xhtml, "footer", extractor.getFooterText());
-            } else if ("PowerPoint Document".equals(name)) {
-                setType(metadata, "application/vnd.ms-powerpoint");
-                PowerPointExtractor extractor =
-                    new PowerPointExtractor(filesystem);
-                xhtml.element("p", extractor.getText(true, true));
-            } else if ("Workbook".equals(name)) {
-                setType(metadata, "application/vnd.ms-excel");
-                Locale locale = context.get(Locale.class, Locale.getDefault());
-                new ExcelExtractor().parse(filesystem, xhtml, locale);
-            } else if ("VisioDocument".equals(name)) {
-                setType(metadata, "application/vnd.visio");
-                VisioTextExtractor extractor =
-                    new VisioTextExtractor(filesystem);
-                for (String text : extractor.getAllText()) {
-                    xhtml.element("p", text);
-                }
-            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
-                // TODO: Cleaner mechanism for detecting Outlook
-                outlookExtracted = true;
-                setType(metadata, "application/vnd.ms-outlook");
-                new OutlookExtractor(filesystem).parse(xhtml, metadata);
+            if (entry instanceof DirectoryEntry) {
+               if ("Quill".equals(name)) {
+                  setType(metadata, "application/x-mspublisher");
+                  PublisherTextExtractor extractor =
+                      new PublisherTextExtractor(filesystem);
+                  xhtml.element("p", extractor.getText());
+               }
+            } else if (entry instanceof DocumentEntry) {
+               if ("WordDocument".equals(name)) {
+                   setType(metadata, "application/msword");
+                   WordExtractor extractor = new WordExtractor(filesystem);
+   
+                   addTextIfAny(xhtml, "header", extractor.getHeaderText());
+   
+                   for (String paragraph : extractor.getParagraphText()) {
+                       xhtml.element("p", paragraph);
+                   }
+   
+                   for (String paragraph : extractor.getFootnoteText()) {
+                       xhtml.element("p", paragraph);
+                   }
+   
+                   for (String paragraph : extractor.getCommentsText()) {
+                       xhtml.element("p", paragraph);
+                   }
+   
+                   for (String paragraph : extractor.getEndnoteText()) {
+                       xhtml.element("p", paragraph);
+                   }
+   
+                   addTextIfAny(xhtml, "footer", extractor.getFooterText());
+               } else if ("PowerPoint Document".equals(name)) {
+                   setType(metadata, "application/vnd.ms-powerpoint");
+                   PowerPointExtractor extractor =
+                       new PowerPointExtractor(filesystem);
+                   xhtml.element("p", extractor.getText(true, true));
+               } else if ("Workbook".equals(name)) {
+                   setType(metadata, "application/vnd.ms-excel");
+                   Locale locale = context.get(Locale.class, Locale.getDefault());
+                   new ExcelExtractor().parse(filesystem, xhtml, locale);
+               } else if ("VisioDocument".equals(name)) {
+                   setType(metadata, "application/vnd.visio");
+                   VisioTextExtractor extractor =
+                       new VisioTextExtractor(filesystem);
+                   for (String text : extractor.getAllText()) {
+                       xhtml.element("p", text);
+                   }
+               } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
+                   // TODO: Cleaner mechanism for detecting Outlook
+                   outlookExtracted = true;
+                   setType(metadata, "application/vnd.ms-outlook");
+                   new OutlookExtractor(filesystem).parse(xhtml, metadata);
+               }
             }
         }
 

Added: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java?rev=903305&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
 (added)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
 Tue Jan 26 16:23:46 2010
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class PublisherParserTest extends TestCase {
+
+    public void testPublisherParser() throws Exception {
+        InputStream input = PublisherParserTest.class.getResourceAsStream(
+                "/test-documents/testPUBLISHER.pub");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/x-mspublisher",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals(null, metadata.get(Metadata.TITLE));
+            assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
+            String content = handler.toString();
+            assertTrue(content.contains("0123456789"));
+            assertTrue(content.contains("abcdef"));
+        } finally {
+            input.close();
+        }
+    }
+
+}

Added: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java?rev=903305&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
 (added)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
 Tue Jan 26 16:23:46 2010
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class VisioParserTest extends TestCase {
+
+    public void testVisioParser() throws Exception {
+        InputStream input = VisioParserTest.class.getResourceAsStream(
+                "/test-documents/testVISIO.vsd");
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/vnd.visio",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("", metadata.get(Metadata.TITLE));
+            assertEquals("Hogwarts", metadata.get(Metadata.AUTHOR));
+            String content = handler.toString();
+            assertTrue(content.contains("Some random text, on a page"));
+        } finally {
+            input.close();
+        }
+    }
+
+}

Added: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub?rev=903305&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testPUBLISHER.pub
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testVISIO.vsd
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testVISIO.vsd?rev=903305&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testVISIO.vsd
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

svn commit: r903305 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ test/java/org/apache/tika/parser/microsoft/ test/resources/test-documents/

Reply via email to