Author: jukka
Date: Sun Oct 7 15:52:54 2007
New Revision: 582693
URL: http://svn.apache.org/viewvc?rev=582693&view=rev
Log:
TIKA-48 - Merge MS Extractors and Parsers
- Moved MSExtractor base class to org.apache.tika.ms.MSParser
- Extracted the PropertiesReaderListener class to a top level class
- Merged MS Extractor classes to MS Parsers
- Refactored the Excel parsing functionality into smaller methods
- Various cleanups (indentation, formatting, etc.)
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/ms/
incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java (with
props)
incubator/tika/trunk/src/main/java/org/apache/tika/ms/PropertiesReaderListener.java
(with props)
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/ExcelExtractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTExtractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordExtractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/MSExtractor.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=582693&r1=582692&r2=582693&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Oct 7 15:52:54 2007
@@ -71,3 +71,5 @@
32. TIKA-47 - Remove TikaLogger (jukka)
33. TIKA-46 - Use Metadata in Parser (jukka & mattmann)
+
+34. TIKA-48 - Merge MS Extractors and Parsers (jukka)
Added: incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java?rev=582693&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java Sun Oct
7 15:52:54 2007
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ms;
+
+// JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.RereadableInputStream;
+
+/**
+ * Defines a Microsoft document content extractor.
+ */
+public abstract class MSParser implements Parser {
+
+    /**
+     * Threshold passed to {@link RereadableInputStream}; presumably the
+     * number of bytes it keeps in memory (TODO confirm against that
+     * class). Static because the value is a class-wide constant rather
+     * than per-instance state.
+     */
+    private static final int MEMORY_THRESHOLD = 1024 * 1024;
+
+    /**
+     * Extracts properties and text from an MS Document input stream.
+     * <p>
+     * The stream is wrapped in a {@link RereadableInputStream} so that it
+     * can be consumed twice: first by a {@link POIFSReader} that feeds the
+     * summary information into the given metadata, and then, after a
+     * rewind, by {@link #extractText(InputStream)}.
+     *
+     * @param input stream containing the document
+     * @param metadata receives the summary properties of the document
+     * @return the text produced by {@link #extractText(InputStream)}
+     * @throws IOException if an I/O error is reported while parsing
+     * @throws TikaException if parsing fails with any other exception
+     */
+    public String parse(InputStream input, Metadata metadata)
+            throws IOException, TikaException {
+        RereadableInputStream ris =
+            new RereadableInputStream(input, MEMORY_THRESHOLD);
+        try {
+            // First, extract properties
+            POIFSReader reader = new POIFSReader();
+            reader.registerListener(
+                    new PropertiesReaderListener(metadata),
+                    SummaryInformation.DEFAULT_STREAM_NAME);
+
+            // NOTE(review): available() is only an estimate and may be 0
+            // for a stream that still has data, in which case the
+            // properties are silently skipped -- confirm this is intended.
+            if (input.available() > 0) {
+                reader.read(ris);
+            }
+            // Drain whatever is left so the whole document has passed
+            // through the wrapper before it is rewound.
+            while (ris.read() != -1) {
+            }
+            ris.rewind();
+            // Extract document full text
+            return extractText(ris);
+        } catch (IOException e) {
+            throw e;
+        } catch (TikaException e) {
+            throw e;
+        } catch (Exception e) {
+            // Anything else is reported as a parse failure, cause preserved.
+            throw new TikaException("Parse error", e);
+        } finally {
+            ris.close();
+        }
+    }
+
+    /**
+     * Extracts the text content from a Microsoft document input stream.
+     *
+     * @param input stream to read the document from; {@link #parse}
+     *        passes the rewound wrapper stream
+     * @return the extracted text
+     * @throws Exception if the text cannot be extracted
+     */
+    protected abstract String extractText(InputStream input) throws Exception;
+
+}
\ No newline at end of file
Propchange: incubator/tika/trunk/src/main/java/org/apache/tika/ms/MSParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/ms/PropertiesReaderListener.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/ms/PropertiesReaderListener.java?rev=582693&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/ms/PropertiesReaderListener.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/ms/PropertiesReaderListener.java
Sun Oct 7 15:52:54 2007
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ms;
+
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+import org.apache.tika.metadata.Metadata;
+
+class PropertiesReaderListener implements POIFSReaderListener {
+
+    private final Metadata metadata;
+
+    public PropertiesReaderListener(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    /**
+     * Copies the summary information carried by the event into the
+     * metadata object given at construction time. Events whose name does
+     * not start with the summary information stream name are ignored, and
+     * so is any failure while reading the property set.
+     *
+     * @param event the POIFS reader event to inspect
+     */
+    public void processPOIFSReaderEvent(POIFSReaderEvent event) {
+        String streamName = event.getName();
+        if (streamName.startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
+            try {
+                SummaryInformation summary = (SummaryInformation)
+                    PropertySetFactory.create(event.getStream());
+                setIfPresent(Metadata.TITLE, summary.getTitle());
+                setIfPresent(Metadata.AUTHOR, summary.getAuthor());
+                setIfPresent(Metadata.KEYWORDS, summary.getKeywords());
+                setIfPresent(Metadata.SUBJECT, summary.getSubject());
+                setIfPresent(Metadata.LAST_AUTHOR, summary.getLastAuthor());
+                setIfPresent(Metadata.COMMENTS, summary.getComments());
+                setIfPresent(Metadata.TEMPLATE, summary.getTemplate());
+                setIfPresent(
+                        Metadata.APPLICATION_NAME,
+                        summary.getApplicationName());
+                setIfPresent(
+                        Metadata.REVISION_NUMBER, summary.getRevNumber());
+                setIfPresent("creationdate", summary.getCreateDateTime());
+                setIfPositive(
+                        Metadata.CHARACTER_COUNT, summary.getCharCount());
+                setIfPositive("edittime", summary.getEditTime());
+                setIfPresent(
+                        Metadata.LAST_SAVED, summary.getLastSaveDateTime());
+                setIfPositive(Metadata.PAGE_COUNT, summary.getPageCount());
+                setIfPositive("security", summary.getSecurity());
+                setIfPositive(Metadata.WORD_COUNT, summary.getWordCount());
+                setIfPresent(
+                        Metadata.LAST_PRINTED, summary.getLastPrinted());
+            } catch (Exception ignored) {
+                // Property extraction is best effort: whatever was set
+                // before the failure is kept and the error is dropped.
+            }
+        }
+    }
+
+    /** Stores the string form of the value unless the value is null. */
+    private void setIfPresent(String name, Object value) {
+        if (value != null) {
+            metadata.set(name, value.toString());
+        }
+    }
+
+    /** Stores the decimal form of the value when it is greater than zero. */
+    private void setIfPositive(String name, long value) {
+        if (value > 0) {
+            metadata.set(name, Long.toString(value));
+        }
+    }
+}
\ No newline at end of file
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/ms/PropertiesReaderListener.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java?rev=582693&r1=582692&r2=582693&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/MsExcelParser.java
Sun Oct 7 15:52:54 2007
@@ -16,26 +16,68 @@
*/
package org.apache.tika.parser.msexcel;
-import java.io.IOException;
import java.io.InputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.tika.ms.MSParser;
/**
* Excel parser
*/
-public class MsExcelParser implements Parser {
+public class MsExcelParser extends MSParser {
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- try {
- return new ExcelExtractor().extract(stream, metadata);
- } catch (IOException e) {
- throw e;
- } catch (Exception e) {
- throw new TikaException("Error parsing an Excel document", e);
+    /**
+     * Reads the stream as an Excel workbook and returns the text of its
+     * cells, separated by single spaces.
+     */
+    protected String extractText(InputStream input) throws Exception {
+        StringBuilder text = new StringBuilder();
+        HSSFWorkbook workbook = new HSSFWorkbook(input);
+        extractText(workbook, text);
+        return text.toString();
+    }
+
+    /** Appends the text of every sheet in the workbook; null is a no-op. */
+    private void extractText(HSSFWorkbook book, StringBuilder builder) {
+        if (book == null) {
+            return;
+        }
+        int sheetIndex = 0;
+        while (sheetIndex < book.getNumberOfSheets()) {
+            extractText(book.getSheetAt(sheetIndex), builder);
+            sheetIndex++;
+        }
+    }
+
+    /**
+     * Appends the text of every row of the sheet, from row 0 up to and
+     * including the last row number; null is a no-op.
+     */
+    private void extractText(HSSFSheet sheet, StringBuilder builder) {
+        if (sheet == null) {
+            return;
+        }
+        int rowIndex = 0;
+        while (rowIndex <= sheet.getLastRowNum()) {
+            extractText(sheet.getRow(rowIndex), builder);
+            rowIndex++;
+        }
+    }
+
+    /** Appends the text of every cell in the row; null is a no-op. */
+    private void extractText(HSSFRow row, StringBuilder builder) {
+        if (row == null) {
+            return;
+        }
+        short column = 0;
+        while (column < row.getLastCellNum()) {
+            extractText(row.getCell(column), builder);
+            column++;
+        }
+    }
+
+    /**
+     * Appends the content of a single cell. String cells contribute their
+     * rich string value; numeric and formula cells contribute their
+     * numeric value rendered with {@link Double#toString(double)}. Cells
+     * of any other type, and null cells, are ignored.
+     */
+    private void extractText(HSSFCell cell, StringBuilder builder) {
+        if (cell == null) {
+            return;
+        }
+        int type = cell.getCellType();
+        if (type == HSSFCell.CELL_TYPE_STRING) {
+            addText(cell.getRichStringCellValue().getString(), builder);
+        } else if (type == HSSFCell.CELL_TYPE_NUMERIC
+                || type == HSSFCell.CELL_TYPE_FORMULA) {
+            addText(Double.toString(cell.getNumericCellValue()), builder);
+        }
+        // any other cell type is ignored
+    }
+
+    /**
+     * Appends the trimmed text to the builder, preceded by a single space
+     * when the builder already holds content. Null text and text that is
+     * empty after trimming are skipped.
+     */
+    private void addText(String text, StringBuilder builder) {
+        if (text != null) {
+            String trimmed = text.trim();
+            if (trimmed.length() != 0) {
+                boolean needsSeparator = builder.length() != 0;
+                if (needsSeparator) {
+                    builder.append(' ');
+                }
+                builder.append(trimmed);
+            }
         }
     }
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java?rev=582693&r1=582692&r2=582693&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java
Sun Oct 7 15:52:54 2007
@@ -30,27 +30,21 @@
import org.apache.poi.util.StringUtil;
/**
- * Listener to read the content of PowerPoint file and transfere it to the
- * passed <code>StringBuffer</code>.
- *
- *
- *
+ * Listener that reads the content of a PowerPoint file and transfers it to
+ * the passed <code>StringBuilder</code>.
*/
class ContentReaderListener implements POIFSReaderListener {
static Logger LOG = Logger.getRootLogger();
/** Buffer holding the content of the file */
- protected final transient StringBuffer buf;
+ private final StringBuilder builder;
/**
* Constructs Listener to get content of PowerPoint file.
- *
- * @param content
- * StringBuffer refereing the content of the PowerPoint file.
*/
- public ContentReaderListener(final StringBuffer content) {
- this.buf = content;
+ public ContentReaderListener(StringBuilder builder) {
+ this.builder = builder;
}
/**
@@ -160,13 +154,13 @@
for (int j = 0; j < scontent.size(); j++) {
contentText = scontent.get(j).toString();
- this.buf.append(contentText);
+ builder.append(contentText);
// to avoid concatinated words we add a blank
additional
if (contentText.length() > 0
&& !(contentText.endsWith("\r") || contentText
.endsWith("\n"))) {
- this.buf.append(" ");
+ builder.append(" ");
}
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java?rev=582693&r1=582692&r2=582693&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/MsPowerPointParser.java
Sun Oct 7 15:52:54 2007
@@ -16,27 +16,24 @@
*/
package org.apache.tika.parser.mspowerpoint;
-import java.io.IOException;
import java.io.InputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.tika.ms.MSParser;
/**
* Power point parser
*/
-public class MsPowerPointParser implements Parser {
+public class MsPowerPointParser extends MSParser {
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- try {
- return new PPTExtractor().extract(stream, metadata);
- } catch (IOException e) {
- throw e;
- } catch (Exception e) {
- throw new TikaException("Error parsing a PowerPoint document", e);
- }
+    /**
+     * Runs a POIFS reader over the stream with a listener registered for
+     * the PowerPoint document stream and returns whatever text the
+     * listener collected.
+     */
+    protected String extractText(InputStream input) throws Exception {
+        StringBuilder text = new StringBuilder();
+        POIFSReader poifs = new POIFSReader();
+        ContentReaderListener listener = new ContentReaderListener(text);
+        poifs.registerListener(listener, PPTConstants.POWERPOINT_DOCUMENT);
+        poifs.read(input);
+        return text.toString();
     }
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java?rev=582693&r1=582692&r2=582693&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/MsWordParser.java
Sun Oct 7 15:52:54 2007
@@ -16,27 +16,180 @@
*/
package org.apache.tika.parser.msword;
-import java.io.IOException;
import java.io.InputStream;
+import java.util.Iterator;
+import java.util.List;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
+import org.apache.poi.hwpf.model.CHPBinTable;
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.sprm.SprmIterator;
+import org.apache.poi.hwpf.sprm.SprmOperation;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.ms.MSParser;
/**
* Word parser
*/
-public class MsWordParser implements Parser {
+public class MsWordParser extends MSParser {
- public String parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- try {
- return new WordExtractor().extract(stream, metadata);
- } catch (IOException e) {
- throw e;
- } catch (Exception e) {
- throw new TikaException("Error parsing a Word document", e);
+ /**
+ * Gets the text from a Word document.
+ * <p>
+ * Reads the "WordDocument" stream into memory, rejects fast-saved and
+ * password protected files, hands documents whose nFib is 101-104 to
+ * {@link Word6Extractor}, and otherwise walks the character runs and
+ * text pieces in parallel, appending the text of every run that is not
+ * marked as deleted.
+ *
+ * @param in The InputStream representing the Word file.
+ * @return the extracted text
+ * @throws Exception if the document is fast-saved or password protected,
+ * or if reading it fails
+ */
+ public String extractText(InputStream in) throws Exception {
+ POIFSFileSystem fsys = new POIFSFileSystem(in);
+
+ // load our POIFS document streams.
+ DocumentEntry headerProps =
+ (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
+ DocumentInputStream din =
fsys.createDocumentInputStream("WordDocument");
+ byte[] header = new byte[headerProps.getSize()];
+
+ din.read(header); // NOTE(review): the byte count returned by read() is ignored
+ din.close();
+
+ int info = LittleEndian.getShort(header, 0xa); // flag bits tested below
+ if ((info & 0x4) != 0) {
+ throw new FastSavedException(
+ "Fast-saved files are unsupported at this time");
+ }
+ if ((info & 0x100) != 0) {
+ throw new PasswordProtectedException(
+ "This document is password protected");
+ }
+
+ // determine the version of Word this document came from.
+ int nFib = LittleEndian.getShort(header, 0x2);
+ switch (nFib) {
+ case 101:
+ case 102:
+ case 103:
+ case 104:
+ // this is a Word 6.0 doc send it to the extractor for that
version.
+ Word6Extractor oldExtractor = new Word6Extractor();
+ return oldExtractor.extractText(header);
+ }
+
+ //get the location of the piece table
+ int complexOffset = LittleEndian.getInt(header, 0x1a2);
+
+ // determine which table stream we must use.
+ //Get the information we need from the header
+ String tableName = null;
+ boolean useTable1 = (info & 0x200) != 0;
+ if (useTable1) {
+ tableName = "1Table";
+ } else {
+ tableName = "0Table";
+ }
+
+ DocumentEntry table =
(DocumentEntry)fsys.getRoot().getEntry(tableName);
+ byte[] tableStream = new byte[table.getSize()];
+
+ din = fsys.createDocumentInputStream(tableName);
+
+ din.read(tableStream); // NOTE(review): the byte count returned by read() is ignored
+ din.close();
+
+ int chpOffset = LittleEndian.getInt(header, 0xfa);
+ int chpSize = LittleEndian.getInt(header, 0xfe);
+ int fcMin = LittleEndian.getInt(header, 0x18);
+ CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset,
chpSize, fcMin);
+
+ // load our text pieces and our character runs
+ ComplexFileTable cft = new ComplexFileTable(header, tableStream,
complexOffset, fcMin);
+ TextPieceTable tpt = cft.getTextPieceTable();
+ List textPieces = tpt.getTextPieces();
+
+ // make the POIFS objects available for garbage collection
+ din = null;
+ fsys = null;
+ table = null;
+ headerProps = null;
+
+ List textRuns = cbt.getTextRuns();
+ Iterator runIt = textRuns.iterator();
+ Iterator textIt = textPieces.iterator();
+
+ TextPiece currentPiece = (TextPiece)textIt.next(); // NOTE(review): no hasNext() check before the first piece
+ int currentTextStart = currentPiece.getStart();
+ int currentTextEnd = currentPiece.getEnd();
+
+ WordTextBuffer finalTextBuf = new WordTextBuffer();
+
+ // iterate through all text runs extract the text only if they haven't
been
+ // deleted
+ while (runIt.hasNext()) {
+ CHPX chpx = (CHPX)runIt.next();
+ boolean deleted = isDeleted(chpx.getGrpprl());
+ if (deleted) {
+ continue;
+ }
+
+ int runStart = chpx.getStart();
+ int runEnd = chpx.getEnd();
+
+ while (runStart >= currentTextEnd) { // skip pieces that end before this run starts; NOTE(review): no hasNext() check
+ currentPiece = (TextPiece) textIt.next ();
+ currentTextStart = currentPiece.getStart ();
+ currentTextEnd = currentPiece.getEnd ();
+ }
+
+ if (runEnd < currentTextEnd) { // run ends inside the current piece
+ String str = currentPiece.substring(runStart -
currentTextStart, runEnd - currentTextStart);
+ finalTextBuf.append(str);
+ } else if (runEnd > currentTextEnd) { // run continues past the current piece
+ while (runEnd > currentTextEnd) {
+ String str = currentPiece.substring(runStart -
currentTextStart,
+ currentTextEnd - currentTextStart);
+ finalTextBuf.append(str);
+ if (textIt.hasNext()) {
+ currentPiece = (TextPiece) textIt.next ();
+ currentTextStart = currentPiece.getStart ();
+ runStart = currentTextStart;
+ currentTextEnd = currentPiece.getEnd ();
+ } else {
+ return finalTextBuf.toString(); // no more pieces: return what has been collected
+ }
+ }
+ String str = currentPiece.substring(0, runEnd -
currentTextStart);
+ finalTextBuf.append(str);
+ } else { // run ends exactly where the current piece ends
+ String str = currentPiece.substring(runStart -
currentTextStart, runEnd - currentTextStart);
+ if (textIt.hasNext()) {
+ currentPiece = (TextPiece) textIt.next();
+ currentTextStart = currentPiece.getStart();
+ currentTextEnd = currentPiece.getEnd();
+ }
+ finalTextBuf.append(str);
+ }
+ }
+ return finalTextBuf.toString();
+ }
+
+    /**
+     * Tells whether a run of text carries a deletion mark.
+     *
+     * @param grpprl The list of sprms for a particular run of text.
+     * @return true if any sprm in the list has operation 0 with a
+     *         non-zero operand, false otherwise.
+     */
+    private boolean isDeleted(byte[] grpprl) {
+        for (SprmIterator sprms = new SprmIterator(grpprl, 0);
+                sprms.hasNext();) {
+            SprmOperation sprm = sprms.next();
+            // 0 is the operation that signals a FDelRMark operation;
+            // the operand is only read for that operation.
+            boolean isDeleteMark = sprm.getOperation() == 0;
+            if (isDeleteMark && sprm.getOperand() != 0) {
+                return true;
+            }
         }
+        return false;
     }
}