Author: jukka
Date: Sun Oct 14 08:41:54 2007
New Revision: 584558

URL: http://svn.apache.org/viewvc?rev=584558&view=rev
Log:
TIKA-63 - Avoid multiple passes over the input stream in Microsoft parsers
    - Use POIFSFileSystem as the source of both metadata and text content
    - Added separate test case classes for the Microsoft parsers
    - Got rid of some extra listeners and exceptions

Added:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
      - copied, changed from r584532, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/
    
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
   (with props)
    
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
   (with props)
    
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
   (with props)
Removed:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Oct 14 08:41:54 2007
@@ -101,3 +101,6 @@
 45. TIKA-58 - Replace jtidy html parser with nekohtml based parser (siren)
 
 46. TIKA-60 - Rename Microsoft parser classes (jukka)
+
+47. TIKA-63 - Avoid multiple passes over the input stream in Microsoft parsers
+              (jukka)

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
 Sun Oct 14 08:41:54 2007
@@ -16,21 +16,26 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import java.io.InputStream;
+import java.io.IOException;
 
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
  * Excel parser
  */
 public class ExcelParser extends OfficeParser {
 
-    protected String extractText(InputStream input) throws Exception {
+    protected String getContentType() {
+        return "application/vnd.ms-excel";
+    }
+
+    protected String extractText(POIFSFileSystem filesystem) throws IOException{
         StringBuilder builder = new StringBuilder();
-        extractText(new HSSFWorkbook(input), builder);
+        extractText(new HSSFWorkbook(filesystem), builder);
         return builder.toString();
     }
 

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Sun Oct 14 08:41:54 2007
@@ -20,13 +20,16 @@
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.HPSFException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.PropertySetFactory;
 import org.apache.poi.hpsf.SummaryInformation;
-import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.XHTMLContentHandler;
-import org.apache.tika.utils.RereadableInputStream;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -35,49 +38,143 @@
  */
 public abstract class OfficeParser implements Parser {
 
-    private final int MEMORY_THRESHOLD = 1024 * 1024;
-
     /**
      * Extracts properties and text from an MS Document input stream
      */
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
-        RereadableInputStream ris =
-            new RereadableInputStream(stream, MEMORY_THRESHOLD, true, false);
-        try {
-            // First, extract properties
-            POIFSReader reader = new POIFSReader();
-            reader.registerListener(
-                    new PropertiesReaderListener(metadata),
-                    SummaryInformation.DEFAULT_STREAM_NAME);
+        POIFSFileSystem filesystem = new POIFSFileSystem(stream);
 
-            if (stream.available() > 0) {
-                reader.read(ris);
-            }
-            while (ris.read() != -1) {
-            }
-            ris.rewind();
-            // Extract document full text
-            XHTMLContentHandler xhtml =
-                new XHTMLContentHandler(handler, metadata);
-            xhtml.startDocument();
-            xhtml.element("p", extractText(ris));
-            xhtml.endDocument();
-        } catch (IOException e) {
-            throw e;
-        } catch (TikaException e) {
-            throw e;
-        } catch (Exception e) {
-            throw new TikaException("Parse error", e);
-        } finally {
-            ris.close();
-        }
+        metadata.set(Metadata.CONTENT_TYPE, getContentType());
+        getMetadata(
+                filesystem, SummaryInformation.DEFAULT_STREAM_NAME, metadata);
+        getMetadata(
+                filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
+                metadata);
+
+        XHTMLContentHandler xhtml =
+            new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.element("p", extractText(filesystem));
+        xhtml.endDocument();
     }
 
     /**
+     * The content type of the document being parsed.
+     *
+     * @return MIME content type
+     */
+    protected abstract String getContentType();
+
+    /**
      * Extracts the text content from a Microsoft document input stream.
      */
-    protected abstract String extractText(InputStream input) throws Exception;
+    protected abstract String extractText(POIFSFileSystem filesystem)
+        throws IOException, TikaException;
+
+    private void getMetadata(
+            POIFSFileSystem filesystem, String name, Metadata metadata) {
+        try {
+            InputStream stream = filesystem.createDocumentInputStream(name);
+            try {
+                getMetadata(stream, metadata);
+            } finally {
+                stream.close();
+            }
+        } catch (Exception e) {
+            // summary information not available, ignore
+        }
+    }
+
+    private void getMetadata(InputStream stream, Metadata metadata)
+            throws HPSFException, IOException {
+        PropertySet set = PropertySetFactory.create(stream);
+        if (set instanceof SummaryInformation) {
+            getMetadata((SummaryInformation) set, metadata);
+        } else if (set instanceof DocumentSummaryInformation) {
+            getMetadata((DocumentSummaryInformation) set, metadata);
+        }
+    }
+
+    private void getMetadata(
+            SummaryInformation information, Metadata metadata) {
+        if (information.getTitle() != null) {
+            metadata.set(Metadata.TITLE, information.getTitle());
+        }
+        if (information.getAuthor() != null) {
+            metadata.set(Metadata.AUTHOR, information.getAuthor());
+        }
+        if (information.getKeywords() != null) {
+            metadata.set(Metadata.KEYWORDS, information.getKeywords());
+        }
+        if (information.getSubject() != null) {
+            metadata.set(Metadata.SUBJECT, information.getSubject());
+        }
+        if (information.getLastAuthor() != null) {
+            metadata.set(Metadata.LAST_AUTHOR, information.getLastAuthor());
+        }
+        if (information.getComments() != null) {
+            metadata.set(Metadata.COMMENTS, information.getComments());
+        }
+        if (information.getTemplate() != null) {
+            metadata.set(Metadata.TEMPLATE, information.getTemplate());
+        }
+        if (information.getApplicationName() != null) {
+            metadata.set(
+                    Metadata.APPLICATION_NAME,
+                    information.getApplicationName());
+        }
+        if (information.getRevNumber() != null) {
+            metadata.set(Metadata.REVISION_NUMBER, information.getRevNumber());
+        }
+        if (information.getCreateDateTime() != null) {
+            metadata.set(
+                    "creationdate",
+                    information.getCreateDateTime().toString());
+        }
+        if (information.getCharCount() > 0) {
+            metadata.set(
+                    Metadata.CHARACTER_COUNT,
+                    Integer.toString(information.getCharCount()));
+        }
+        if (information.getEditTime() > 0) {
+            metadata.set("edittime", Long.toString(information.getEditTime()));
+        }
+        if (information.getLastSaveDateTime() != null) {
+            metadata.set(
+                    Metadata.LAST_SAVED,
+                    information.getLastSaveDateTime().toString());
+        }
+        if (information.getPageCount() > 0) {
+            metadata.set(
+                    Metadata.PAGE_COUNT,
+                    Integer.toString(information.getPageCount()));
+        }
+        if (information.getSecurity() > 0) {
+            metadata.set(
+                    "security", Integer.toString(information.getSecurity()));
+        }
+        if (information.getWordCount() > 0) {
+            metadata.set(
+                    Metadata.WORD_COUNT,
+                    Integer.toString(information.getWordCount()));
+        }
+        if (information.getLastPrinted() != null) {
+            metadata.set(
+                    Metadata.LAST_PRINTED,
+                    information.getLastPrinted().toString());
+        }
+    }
+
+    private void getMetadata(
+            DocumentSummaryInformation information, Metadata metadata) {
+        if (information.getCompany() != null) {
+            metadata.set("company", information.getCompany());
+        }
+        if (information.getManager() != null) {
+            metadata.set("manager", information.getManager());
+        }
+    }
 
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
 Sun Oct 14 08:41:54 2007
@@ -47,11 +47,6 @@
   /** ATOM ID of drawing group area */
   public static final long PPT_ATOM_DRAWINGGROUP = 61448L;
 
-  /** Name for PowerPoint Documents within the file */
-  public static final String POWERPOINT_DOCUMENT = "PowerPoint Document";
-
-
-
   /**
    * Protected constructor to prevent instantiation.
    */

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
 (from r584532, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java&r1=584532&r2=584558&rev=584558&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
 Sun Oct 14 08:41:54 2007
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.InputStream;
 import java.util.Enumeration;
 import java.util.Hashtable;
 import java.util.List;
@@ -23,17 +24,10 @@
 
 import org.apache.log4j.Logger;
 import org.apache.poi.hdf.extractor.Utils;
-import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
-import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.StringUtil;
 
-/**
- * Listener to read the content of PowerPoint file and transfers it to the
- * passed <code>StringBuilder</code>.
- */
-class ContentReaderListener implements POIFSReaderListener {
+class PowerPointExtractor {
 
     static Logger LOG = Logger.getRootLogger();
 
@@ -43,31 +37,15 @@
     /**
      * Constructs Listener to get content of PowerPoint file.
      */
-    public ContentReaderListener(StringBuilder builder) {
+    public PowerPointExtractor(StringBuilder builder) {
         this.builder = builder;
     }
 
     /**
      * Reads the internal PowerPoint document stream.
-     * 
-     * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
      */
-    public void processPOIFSReaderEvent(final POIFSReaderEvent event) {
-
-        if (event == null
-                || event.getName() == null
-                || !event.getName()
-                        .startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
-
-            LOG
-                    .warn("Stream not processed. It is not a PowerPoint document: : "
-                            + event.getName());
-
-            return;
-        }
-
+    public void extract(InputStream dis) {
         try {
-            final DocumentInputStream dis = event.getStream();
             final byte pptdata[] = new byte[dis.available()];
             dis.read(pptdata, 0, dis.available());
             int offset = 0;
@@ -457,4 +435,5 @@
 
         return slides;
     }
+
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
 Sun Oct 14 08:41:54 2007
@@ -16,22 +16,35 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
 /**
  * Power point parser
  */
 public class PowerPointParser extends OfficeParser {
 
-    protected String extractText(InputStream input) throws Exception {
+    /**
+     *  Name of a PowerPoint document within a POIFS file system
+     */
+    private  static final String POWERPOINT = "PowerPoint Document";
+
+    protected String getContentType() {
+        return "application/vnd.ms-powerpoint";
+    }
+
+    protected String extractText(POIFSFileSystem filesystem) throws IOException {
         StringBuilder builder = new StringBuilder();
-        POIFSReader reader = new POIFSReader();
-        reader.registerListener(
-                new ContentReaderListener(builder),
-                PPTConstants.POWERPOINT_DOCUMENT);
-        reader.read(input);
+
+        InputStream stream = filesystem.createDocumentInputStream(POWERPOINT);
+        try {
+            new PowerPointExtractor(builder).extract(stream);
+        } finally {
+            stream.close();
+        }
+
         return builder.toString();
     }
 

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
 Sun Oct 14 08:41:54 2007
@@ -20,6 +20,7 @@
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.hwpf.model.*;
 
+import java.io.IOException;
 import java.util.*;
 
 /**
@@ -44,8 +45,7 @@
    * @return The text from the document
    * @throws Exception If there are any unexpected exceptions.
    */
-  public String extractText(byte[] mainStream) throws Exception
-  {
+  public String extractText(byte[] mainStream) throws IOException {
     int fcMin = LittleEndian.getInt(mainStream, 0x18);
     int fcMax = LittleEndian.getInt(mainStream, 0x1C);
 

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
 Sun Oct 14 08:41:54 2007
@@ -16,7 +16,7 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import java.io.InputStream;
+import java.io.IOException;
 import java.util.Iterator;
 import java.util.List;
 
@@ -31,20 +31,24 @@
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.TikaException;
 
 /**
  * Word parser
  */
 public class WordParser extends OfficeParser {
 
+    protected String getContentType() {
+        return "application/msword";
+    }
+
     /**
      * Gets the text from a Word document.
      *
      * @param in The InputStream representing the Word file.
      */
-    public String extractText(InputStream in) throws Exception {
-        POIFSFileSystem fsys = new POIFSFileSystem(in);
-
+    public String extractText(POIFSFileSystem fsys)
+            throws IOException, TikaException {
         // load our POIFS document streams.
         DocumentEntry headerProps =
             (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
@@ -56,12 +60,10 @@
 
         int info = LittleEndian.getShort(header, 0xa);
         if ((info & 0x4) != 0) {
-            throw new FastSavedException(
-                    "Fast-saved files are unsupported at this time");
+            throw new TikaException("Fast-saved files are unsupported");
         }
         if ((info & 0x100) != 0) {
-            throw new PasswordProtectedException(
-                    "This document is password protected");
+            throw new TikaException("This document is password protected");
         }
 
         // determine the version of Word this document came from.

Added: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=584558&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 (added)
+++ 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 Sun Oct 14 08:41:54 2007
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class ExcelParserTest extends TestCase {
+
+    public void testExcelParser() throws Exception {
+        InputStream input = ExcelParserTest.class.getResourceAsStream(
+                "/test-documents/testEXCEL.xls");
+        try {
+            Metadata metadata = new Metadata();
+            StringWriter writer = new StringWriter();
+            ContentHandler handler = new WriteOutContentHandler(writer);
+            new ExcelParser().parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/vnd.ms-excel",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            String content = writer.toString();
+            assertTrue(content.contains("Sample Excel Worksheet"));
+            assertTrue(content.contains("Numbers and their Squares"));
+            assertTrue(content.contains("9.0"));
+            assertTrue(content.contains("196.0"));
+        } finally {
+            input.close();
+        }
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=584558&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 (added)
+++ 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
 Sun Oct 14 08:41:54 2007
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class PowerPointParserTest extends TestCase {
+
+    public void testPowerPointParser() throws Exception {
+        InputStream input = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT.ppt");
+        try {
+            Metadata metadata = new Metadata();
+            StringWriter writer = new StringWriter();
+            ContentHandler handler = new WriteOutContentHandler(writer);
+            new PowerPointParser().parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/vnd.ms-powerpoint",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            String content = writer.toString();
+            assertTrue(content.contains("Sample Powerpoint Slide"));
+            assertTrue(content.contains("Powerpoint X for Mac"));
+        } finally {
+            input.close();
+        }
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=584558&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 (added)
+++ 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
 Sun Oct 14 08:41:54 2007
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class WordParserTest extends TestCase {
+
+    public void testWordParser() throws Exception {
+        InputStream input = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD.doc");
+        try {
+            Metadata metadata = new Metadata();
+            StringWriter writer = new StringWriter();
+            ContentHandler handler = new WriteOutContentHandler(writer);
+            new WordParser().parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/msword",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            String content = writer.toString();
+            assertTrue(content.contains("Sample Word Document"));
+        } finally {
+            input.close();
+        }
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to