Author: ridabenjelloun
Date: Wed Oct  3 07:32:49 2007
New Revision: 581613

URL: http://svn.apache.org/viewvc?rev=581613&view=rev
Log:
TIKA-35 - Extract MsOffice properties, use RereadableInputStream developed by 
K. Bennett

Added:
    
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java
   (with props)
    
incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java
   (with props)
Modified:
    incubator/tika/trunk/CHANGES.txt
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Wed Oct  3 07:32:49 2007
@@ -57,7 +57,7 @@
 
 26. TIKA-38 - TXTParser adds a space to the content it reads from a file (K. 
Bennett & ridabenjelloun)
 
-27. TIKA-35 - Extract MsOffice properties (ridabenjelloun)
+27. TIKA-35 - Extract MsOffice properties, use RereadableInputStream developed 
by K. Bennett (ridabenjelloun & K. Bennett)
 
 28. TIKA-39 - Excel parsing improvements (siren & ridabenjelloun)
 

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
 Wed Oct  3 07:32:49 2007
@@ -23,7 +23,6 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.utils.MSExtractor;
-import org.apache.tika.utils.Utils;
 
 /**
  * Excel parser
@@ -35,9 +34,8 @@
                try {
                        MSExtractor extractor = new ExcelExtractor();
                        extractor.setContents(contents);
-                       InputStream[] isa = Utils.copyInputStream(stream, 2);
-                       extractor.extractProperties(isa[0]);
-                       return extractor.extractText(isa[1]);
+                       extractor.extract(stream);
+                       return extractor.getText();
                } catch (IOException e) {
                        throw e;
                } catch (Exception e) {

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
 Wed Oct  3 07:32:49 2007
@@ -23,34 +23,25 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.utils.MSExtractor;
-import org.apache.tika.utils.Utils;
 
 /**
  * Power point parser
  */
 public class MsPowerPointParser extends Parser {
 
+       
        protected String parse(InputStream stream, Iterable<Content> contents)
                        throws IOException, TikaException {
                try {
                        MSExtractor extractor = new PPTExtractor();
                        extractor.setContents(contents);
-                       InputStream[] isa = Utils.copyInputStream(stream, 2);
-                       extractor.extractProperties(isa[0]);
-                       return extractor.extractText(isa[1]);
+                       extractor.extract(stream);
+                       return extractor.getText();
                } catch (IOException e) {
                        throw e;
                } catch (Exception e) {
                        throw new TikaException("Error parsing a PowerPoint 
document", e);
                }
        }
-
-       /*
-        * public List<Content> getContents() {
-        * extrator.setContents(getParserConfig().getContents()); try {
-        * extrator.extract(getInputStream()); } catch (Exception e) { // TODO
-        * Auto-generated catch block e.printStackTrace(); } return
-        * getParserConfig().getContents(); }
-        */
 
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
 Wed Oct  3 07:32:49 2007
@@ -23,7 +23,6 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.utils.MSExtractor;
-import org.apache.tika.utils.Utils;
 
 /**
  * Word parser
@@ -35,9 +34,8 @@
                try {
                        MSExtractor extractor = new WordExtractor();
                        extractor.setContents(contents);
-                       InputStream[] isa = Utils.copyInputStream(stream, 2);
-                       extractor.extractProperties(isa[0]);
-                       return extractor.extractText(isa[1]);
+                       extractor.extract(stream);
+                       return extractor.getText();
                } catch (IOException e) {
                        throw e;
                } catch (Exception e) {

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java 
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java 
Wed Oct  3 07:32:49 2007
@@ -42,6 +42,8 @@
 
        private Iterable<Content> contents;
 
+       private final int MEMORY_THRESHOLD = 1024 * 1024;
+
        /** Constructs a new Microsoft document extractor. */
        public MSExtractor() {
        }
@@ -53,18 +55,23 @@
        /**
         * Extracts properties and text from an MS Document input stream
         */
-       public void extractProperties(InputStream input) throws Exception {
+       public void extract(InputStream input) throws Exception {
                // First, extract properties
                this.reader = new POIFSReader();
 
                this.reader.registerListener(new PropertiesReaderListener(),
                                SummaryInformation.DEFAULT_STREAM_NAME);
-               // input.reset();
+
+               RereadableInputStream ris = new RereadableInputStream(input,
+                               MEMORY_THRESHOLD);
                if (input.available() > 0) {
-                       reader.read(input);
+                       reader.read(ris);
+               }
+               while (ris.read() != -1) {
                }
-               // input.reset();
-               // this.text = extractText(input);
+               ris.rewind();
+               // Extract document full text
+               this.text = extractText(ris);
        }
 
        /**
@@ -77,7 +84,7 @@
         * 
         * @return the content text of the document
         */
-       protected String getText() {
+       public String getText() {
                return this.text;
        }
 

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java 
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java 
Wed Oct  3 07:32:49 2007
@@ -17,6 +17,7 @@
 package org.apache.tika.utils;
 
 // JDK imports
+import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -171,7 +172,7 @@
     public static String getStringContent(
             File documentFile, TikaConfig config, String mimeType)
             throws TikaException, IOException {
-        InputStream stream = new FileInputStream(documentFile);
+        InputStream stream = new BufferedInputStream(new 
FileInputStream(documentFile));
         try {
             return getStringContent(stream, config, mimeType);
         } finally {

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java?rev=581613&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java
 Wed Oct  3 07:32:49 2007
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+public class RereadableInputStream extends InputStream {
+
+       private InputStream inputStream;
+
+       private int maxBytesInMemory;
+
+       private boolean firstPass = true;
+
+       private boolean bufferIsInFile;
+
+       private byte[] byteBuffer;
+
+       private int size;
+
+       private File storeFile;
+
+       private OutputStream storeOutputStream;
+
+       public RereadableInputStream(InputStream inputStream, int 
maxBytesInMemory) {
+               this.inputStream = inputStream;
+               this.maxBytesInMemory = maxBytesInMemory;
+               byteBuffer = new byte[maxBytesInMemory];
+       }
+
+       public int read() throws IOException {
+               int inputByte = inputStream.read();
+               if (firstPass) {
+                       saveByte(inputByte);
+               }
+               return inputByte;
+       }
+
+       private void saveByte(int inputByte) throws IOException {
+
+               if (!bufferIsInFile) {
+                       boolean switchToFile = (size == (maxBytesInMemory));
+                       if (switchToFile) {
+                               storeFile = File.createTempFile("streamstore_", 
".tmp");
+                               bufferIsInFile = true;
+                               storeOutputStream = new BufferedOutputStream(
+                                               new 
FileOutputStream(storeFile));
+                               storeOutputStream.write(byteBuffer, 0, size);
+                               storeOutputStream.write(inputByte);
+                       } else {
+                               byteBuffer[size] = (byte) inputByte;
+                       }
+               } else {
+                       storeOutputStream.write(inputByte);
+               }
+               ++size;
+       }
+
+       public void rewind() throws IOException {
+               closeStream();
+               if (storeOutputStream != null) {
+                       storeOutputStream.close();
+                       storeOutputStream = null;
+               }
+               firstPass = false;
+               boolean newStreamIsInMemory = (size < maxBytesInMemory);
+               inputStream = newStreamIsInMemory ? new 
ByteArrayInputStream(byteBuffer)
+                               : new BufferedInputStream(new 
FileInputStream(storeFile));
+       }
+
+       public void closeStream() throws IOException {
+               if (inputStream != null) {
+                       inputStream.close();
+                       inputStream = null;
+               }
+       }
+
+       public void close() throws IOException {
+               closeStream();
+               super.close();
+               if (storeFile != null) {
+                       storeFile.delete();
+               }
+       }
+}

Propchange: 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java 
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Wed Oct 
 3 07:32:49 2007
@@ -158,48 +158,4 @@
                }
        }
 
-       /**
-        * Get the contents of an <code>InputStream</code> as a
-        * <code>byte[]</code>.
-        * <p>
-        * This method buffers the input internally, so there is no need to use 
a
-        * 
-        * <code>BufferedInputStream</code>.
-        * 
-        * @param input
-        *            the <code>InputStream</code> to read from
-        * @return the requested byte array
-        * @throws NullPointerException
-        *             if the input is null
-        * 
-        * @throws IOException
-        *             if an I/O error occurs
-        */
-       public static byte[] toByteArray(InputStream input) throws IOException {
-               ByteArrayOutputStream output = new ByteArrayOutputStream();
-               copy(input, output);
-               return output.toByteArray();
-       }
-
-       public static long copy(InputStream input, OutputStream output)
-                       throws IOException {
-               byte[] buffer = new byte[1024];
-               long count = 0;
-               int n = 0;
-               while (-1 != (n = input.read(buffer))) {
-                       output.write(buffer, 0, n);
-                       count += n;
-               }
-               return count;
-       }
-       
-       public static InputStream[] copyInputStream(InputStream is, int 
nbCopies) throws IOException {      
-               InputStream[] isa = new InputStream[nbCopies];
-               byte[] content = toByteArray(is);
-        for (int i = 0; i < nbCopies; i++) {
-                       isa[i] = new ByteArrayInputStream(content);
-               }
-        return isa;
-}
-
 }

Added: 
incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java?rev=581613&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java
 (added)
+++ 
incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java
 Wed Oct  3 07:32:49 2007
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.utils.RereadableInputStream;
+
+import junit.framework.TestCase;
+
+public class RereadableInputStreamTest extends TestCase {
+
+       private final int TEST_SIZE = 3;
+
+       private final int MEMORY_THRESHOLD = 1;
+
+       private final int NUM_PASSES = 4;
+
+       public void test() throws IOException {
+               File file = createTestFile();
+               InputStream is = new BufferedInputStream(new 
FileInputStream(file));
+               RereadableInputStream ris = new RereadableInputStream(is,
+                               MEMORY_THRESHOLD);
+               for (int pass = 0; pass < NUM_PASSES; pass++) {
+                       for (int byteNum = 0; byteNum < TEST_SIZE; byteNum++) {
+                               int byteRead = ris.read();
+                               assertEquals("Pass = " + pass + ", byte num 
should be "
+                                               + byteNum + " but is " + 
byteRead + ".", byteNum,
+                                               byteRead);
+                       }
+                       ris.rewind();
+               }
+       }
+
+       private File createTestFile() throws IOException {
+               File testfile = File.createTempFile("ris_test", ".tmp");
+               FileOutputStream fos = new FileOutputStream(testfile);
+               for (int i = 0; i < TEST_SIZE; i++) {
+                       fos.write(i);
+               }
+               fos.close();
+               return testfile;
+       }
+}

Propchange: 
incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to