Author: ridabenjelloun
Date: Wed Oct 3 07:32:49 2007
New Revision: 581613
URL: http://svn.apache.org/viewvc?rev=581613&view=rev
Log:
TIKA-35 - Extract MsOffice properties, use RereadableInputStream developed by
K. Bennett
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java
(with props)
incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java
(with props)
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Wed Oct 3 07:32:49 2007
@@ -57,7 +57,7 @@
26. TIKA-38 - TXTParser adds a space to the content it reads from a file (K.
Bennett & ridabenjelloun)
-27. TIKA-35 - Extract MsOffice properties (ridabenjelloun)
+27. TIKA-35 - Extract MsOffice properties, use RereadableInputStream devloped by K. Bennett (ridabenjelloun & K. Bennett)
28. TIKA-39 - Excel parsing improvements (siren & ridabenjelloun)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java Wed Oct 3 07:32:49 2007
@@ -23,7 +23,6 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
import org.apache.tika.utils.MSExtractor;
-import org.apache.tika.utils.Utils;
/**
* Excel parser
@@ -35,9 +34,8 @@
try {
MSExtractor extractor = new ExcelExtractor();
extractor.setContents(contents);
- InputStream[] isa = Utils.copyInputStream(stream, 2);
- extractor.extractProperties(isa[0]);
- return extractor.extractText(isa[1]);
+ extractor.extract(stream);
+ return extractor.getText();
} catch (IOException e) {
throw e;
} catch (Exception e) {
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java Wed Oct 3 07:32:49 2007
@@ -23,34 +23,25 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
import org.apache.tika.utils.MSExtractor;
-import org.apache.tika.utils.Utils;
/**
* Power point parser
*/
public class MsPowerPointParser extends Parser {
+
protected String parse(InputStream stream, Iterable<Content> contents)
throws IOException, TikaException {
try {
MSExtractor extractor = new PPTExtractor();
extractor.setContents(contents);
- InputStream[] isa = Utils.copyInputStream(stream, 2);
- extractor.extractProperties(isa[0]);
- return extractor.extractText(isa[1]);
+ extractor.extract(stream);
+ return extractor.getText();
} catch (IOException e) {
throw e;
} catch (Exception e) {
throw new TikaException("Error parsing a PowerPoint document", e);
}
}
-
- /*
- * public List<Content> getContents() {
- * extrator.setContents(getParserConfig().getContents()); try {
- * extrator.extract(getInputStream()); } catch (Exception e) { // TODO
- * Auto-generated catch block e.printStackTrace(); } return
- * getParserConfig().getContents(); }
- */
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java Wed Oct 3 07:32:49 2007
@@ -23,7 +23,6 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
import org.apache.tika.utils.MSExtractor;
-import org.apache.tika.utils.Utils;
/**
* Word parser
@@ -35,9 +34,8 @@
try {
MSExtractor extractor = new WordExtractor();
extractor.setContents(contents);
- InputStream[] isa = Utils.copyInputStream(stream, 2);
- extractor.extractProperties(isa[0]);
- return extractor.extractText(isa[1]);
+ extractor.extract(stream);
+ return extractor.getText();
} catch (IOException e) {
throw e;
} catch (Exception e) {
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java Wed Oct 3 07:32:49 2007
@@ -42,6 +42,8 @@
private Iterable<Content> contents;
+ private final int MEMORY_THRESHOLD = 1024 * 1024;
+
/** Constructs a new Microsoft document extractor. */
public MSExtractor() {
}
@@ -53,18 +55,23 @@
/**
* Extracts properties and text from an MS Document input stream
*/
- public void extractProperties(InputStream input) throws Exception {
+ public void extract(InputStream input) throws Exception {
// First, extract properties
this.reader = new POIFSReader();
this.reader.registerListener(new PropertiesReaderListener(),
SummaryInformation.DEFAULT_STREAM_NAME);
- // input.reset();
+
+ RereadableInputStream ris = new RereadableInputStream(input,
+ MEMORY_THRESHOLD);
if (input.available() > 0) {
- reader.read(input);
+ reader.read(ris);
+ }
+ while (ris.read() != -1) {
}
- // input.reset();
- // this.text = extractText(input);
+ ris.rewind();
+ // Extract document full text
+ this.text = extractText(ris);
}
/**
@@ -77,7 +84,7 @@
*
* @return the content text of the document
*/
- protected String getText() {
+ public String getText() {
return this.text;
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java Wed Oct 3 07:32:49 2007
@@ -17,6 +17,7 @@
package org.apache.tika.utils;
// JDK imports
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@@ -171,7 +172,7 @@
public static String getStringContent(
File documentFile, TikaConfig config, String mimeType)
throws TikaException, IOException {
- InputStream stream = new FileInputStream(documentFile);
+ InputStream stream = new BufferedInputStream(new FileInputStream(documentFile));
try {
return getStringContent(stream, config, mimeType);
} finally {
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java?rev=581613&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java Wed Oct 3 07:32:49 2007
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+public class RereadableInputStream extends InputStream {
+
+ private InputStream inputStream;
+
+ private int maxBytesInMemory;
+
+ private boolean firstPass = true;
+
+ private boolean bufferIsInFile;
+
+ private byte[] byteBuffer;
+
+ private int size;
+
+ private File storeFile;
+
+ private OutputStream storeOutputStream;
+
+ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory) {
+ this.inputStream = inputStream;
+ this.maxBytesInMemory = maxBytesInMemory;
+ byteBuffer = new byte[maxBytesInMemory];
+ }
+
+ public int read() throws IOException {
+ int inputByte = inputStream.read();
+ if (firstPass) {
+ saveByte(inputByte);
+ }
+ return inputByte;
+ }
+
+ private void saveByte(int inputByte) throws IOException {
+
+ if (!bufferIsInFile) {
+ boolean switchToFile = (size == (maxBytesInMemory));
+ if (switchToFile) {
+ storeFile = File.createTempFile("streamstore_", ".tmp");
+ bufferIsInFile = true;
+ storeOutputStream = new BufferedOutputStream(
+ new FileOutputStream(storeFile));
+ storeOutputStream.write(byteBuffer, 0, size);
+ storeOutputStream.write(inputByte);
+ } else {
+ byteBuffer[size] = (byte) inputByte;
+ }
+ } else {
+ storeOutputStream.write(inputByte);
+ }
+ ++size;
+ }
+
+ public void rewind() throws IOException {
+ closeStream();
+ if (storeOutputStream != null) {
+ storeOutputStream.close();
+ storeOutputStream = null;
+ }
+ firstPass = false;
+ boolean newStreamIsInMemory = (size < maxBytesInMemory);
+ inputStream = newStreamIsInMemory ? new ByteArrayInputStream(byteBuffer)
+ : new BufferedInputStream(new FileInputStream(storeFile));
+ }
+
+ public void closeStream() throws IOException {
+ if (inputStream != null) {
+ inputStream.close();
+ inputStream = null;
+ }
+ }
+
+ public void close() throws IOException {
+ closeStream();
+ super.close();
+ if (storeFile != null) {
+ storeFile.delete();
+ }
+ }
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/RereadableInputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=581613&r1=581612&r2=581613&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Wed Oct 3 07:32:49 2007
@@ -158,48 +158,4 @@
}
}
- /**
- * Get the contents of an <code>InputStream</code> as a
- * <code>byte[]</code>.
- * <p>
- * This method buffers the input internally, so there is no need to use a
- *
- * <code>BufferedInputStream</code>.
- *
- * @param input
- * the <code>InputStream</code> to read from
- * @return the requested byte array
- * @throws NullPointerException
- * if the input is null
- *
- * @throws IOException
- * if an I/O error occurs
- */
- public static byte[] toByteArray(InputStream input) throws IOException {
- ByteArrayOutputStream output = new ByteArrayOutputStream();
- copy(input, output);
- return output.toByteArray();
- }
-
- public static long copy(InputStream input, OutputStream output)
- throws IOException {
- byte[] buffer = new byte[1024];
- long count = 0;
- int n = 0;
- while (-1 != (n = input.read(buffer))) {
- output.write(buffer, 0, n);
- count += n;
- }
- return count;
- }
-
- public static InputStream[] copyInputStream(InputStream is, int nbCopies) throws IOException {
- InputStream[] isa = new InputStream[nbCopies];
- byte[] content = toByteArray(is);
- for (int i = 0; i < nbCopies; i++) {
- isa[i] = new ByteArrayInputStream(content);
- }
- return isa;
-}
-
}
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java?rev=581613&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java Wed Oct 3 07:32:49 2007
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.utils.RereadableInputStream;
+
+import junit.framework.TestCase;
+
+public class RereadableInputStreamTest extends TestCase {
+
+ private final int TEST_SIZE = 3;
+
+ private final int MEMORY_THRESHOLD = 1;
+
+ private final int NUM_PASSES = 4;
+
+ public void test() throws IOException {
+ File file = createTestFile();
+ InputStream is = new BufferedInputStream(new FileInputStream(file));
+ RereadableInputStream ris = new RereadableInputStream(is,
+ MEMORY_THRESHOLD);
+ for (int pass = 0; pass < NUM_PASSES; pass++) {
+ for (int byteNum = 0; byteNum < TEST_SIZE; byteNum++) {
+ int byteRead = ris.read();
+ assertEquals("Pass = " + pass + ", byte num should be "
+ + byteNum + " but is " + byteRead + ".", byteNum,
+ byteRead);
+ }
+ ris.rewind();
+ }
+ }
+
+ private File createTestFile() throws IOException {
+ File testfile = File.createTempFile("ris_test", ".tmp");
+ FileOutputStream fos = new FileOutputStream(testfile);
+ for (int i = 0; i < TEST_SIZE; i++) {
+ fos.write(i);
+ }
+ fos.close();
+ return testfile;
+ }
+}
Propchange:
incubator/tika/trunk/src/test/java/org/apache/tika/RereadableInputStreamTest.java
------------------------------------------------------------------------------
svn:eol-style = native