Author: jukka
Date: Sun Oct 14 05:24:19 2007
New Revision: 584532
URL: http://svn.apache.org/viewvc?rev=584532&view=rev
Log:
TIKA-60 - Rename Microsoft parser classes
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/
- copied from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ms/
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
(with props)
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/FastSavedException.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/FilteredStringWriter.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTConstants.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/PasswordProtectedException.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
(with props)
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/Slide.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/TextBox.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/chp/Word6CHPBinTable.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/Word6Extractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
(with props)
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextBuffer.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java
- copied, changed from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextPiece.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ms/
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
incubator/tika/trunk/src/main/resources/tika-config.xml
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=584532&r1=584531&r2=584532&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Oct 14 05:24:19 2007
@@ -99,3 +99,5 @@
config path in TestParsers (jukka)
45. TIKA-58 - Replace jtidy html parser with nekohtml based parser (siren)
+
+46. TIKA-60 - Rename Microsoft parser classes (jukka)
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
import java.util.Enumeration;
import java.util.Hashtable;
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?rev=584532&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
Sun Oct 14 05:24:19 2007
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+
+/**
+ * Excel parser
+ */
+public class ExcelParser extends MSParser {
+
+ protected String extractText(InputStream input) throws Exception {
+ StringBuilder builder = new StringBuilder();
+ extractText(new HSSFWorkbook(input), builder);
+ return builder.toString();
+ }
+
+ private void extractText(HSSFWorkbook book, StringBuilder builder) {
+ for (int i = 0; book != null && i < book.getNumberOfSheets(); i++) {
+ extractText(book.getSheetAt(i), builder);
+ }
+ }
+
+ private void extractText(HSSFSheet sheet, StringBuilder builder) {
+ for (int i = 0; sheet != null && i <= sheet.getLastRowNum(); i++) {
+ extractText(sheet.getRow(i), builder);
+ }
+ }
+
+ private void extractText(HSSFRow row, StringBuilder builder) {
+ for (short i = 0; row != null && i < row.getLastCellNum(); i++) {
+ extractText(row.getCell(i), builder);
+ }
+ }
+
+ private void extractText(HSSFCell cell, StringBuilder builder) {
+ if (cell != null) {
+ switch (cell.getCellType()) {
+ case HSSFCell.CELL_TYPE_STRING:
+ addText(cell.getRichStringCellValue().getString(), builder);
+ break;
+ case HSSFCell.CELL_TYPE_NUMERIC:
+ case HSSFCell.CELL_TYPE_FORMULA:
+ addText(Double.toString(cell.getNumericCellValue()), builder);
+ break;
+ default:
+ // ignore
+ }
+ }
+ }
+
+ private void addText(String text, StringBuilder builder) {
+ if (text != null) {
+ text = text.trim();
+ if (text.length() > 0) {
+ if (builder.length() > 0) {
+ builder.append(' ');
+ }
+ builder.append(text);
+ }
+ }
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/FastSavedException.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/FastSavedException.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/FastSavedException.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java
Sun Oct 14 05:24:19 2007
@@ -12,7 +12,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
public class FastSavedException extends Exception {
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/FilteredStringWriter.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/FilteredStringWriter.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/FilteredStringWriter.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java
Sun Oct 14 05:24:19 2007
@@ -15,7 +15,7 @@
* limitations under the License.
*/
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
import java.io.StringWriter;
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java?rev=584532&r1=584528&r2=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java
Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.ms;
+package org.apache.tika.parser.microsoft;
// JDK imports
import java.io.IOException;
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTConstants.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTConstants.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTConstants.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
/**
* Package protected class for the required internal MS PowerPoint constants.
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/PasswordProtectedException.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/PasswordProtectedException.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/PasswordProtectedException.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java
Sun Oct 14 05:24:19 2007
@@ -12,7 +12,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
public class PasswordProtectedException extends Exception {
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=584532&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
Sun Oct 14 05:24:19 2007
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+/**
+ * Power point parser
+ */
+public class PowerPointParser extends MSParser {
+
+ protected String extractText(InputStream input) throws Exception {
+ StringBuilder builder = new StringBuilder();
+ POIFSReader reader = new POIFSReader();
+ reader.registerListener(
+ new ContentReaderListener(builder),
+ PPTConstants.POWERPOINT_DOCUMENT);
+ reader.read(input);
+ return builder.toString();
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java?rev=584532&r1=584528&r2=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.ms;
+package org.apache.tika.parser.microsoft;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/Slide.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/Slide.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/Slide.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java
Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
import java.util.List;
import java.util.Vector;
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/TextBox.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/TextBox.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/TextBox.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java
Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
/**
* Package protected class for the MS Powerpoint TextBox content
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/chp/Word6CHPBinTable.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/chp/Word6CHPBinTable.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/chp/Word6CHPBinTable.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java
Sun Oct 14 05:24:19 2007
@@ -13,7 +13,7 @@
* limitations under the License.
*/
-package org.apache.tika.parser.msword.chp;
+package org.apache.tika.parser.microsoft;
import java.util.List;
import java.util.ArrayList;
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/Word6Extractor.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/Word6Extractor.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/Word6Extractor.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
Sun Oct 14 05:24:19 2007
@@ -13,10 +13,9 @@
* limitations under the License.
*/
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
-import org.apache.tika.parser.msword.chp.Word6CHPBinTable;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.hwpf.model.*;
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=584532&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Sun Oct 14 05:24:19 2007
@@ -0,0 +1,194 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.poi.hwpf.model.CHPBinTable;
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.sprm.SprmIterator;
+import org.apache.poi.hwpf.sprm.SprmOperation;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Word parser
+ */
+public class WordParser extends MSParser {
+
+ /**
+ * Gets the text from a Word document.
+ *
+ * @param in The InputStream representing the Word file.
+ */
+ public String extractText(InputStream in) throws Exception {
+ POIFSFileSystem fsys = new POIFSFileSystem(in);
+
+ // load our POIFS document streams.
+ DocumentEntry headerProps =
+ (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
+ DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
+ byte[] header = new byte[headerProps.getSize()];
+
+ din.read(header);
+ din.close();
+
+ int info = LittleEndian.getShort(header, 0xa);
+ if ((info & 0x4) != 0) {
+ throw new FastSavedException(
+ "Fast-saved files are unsupported at this time");
+ }
+ if ((info & 0x100) != 0) {
+ throw new PasswordProtectedException(
+ "This document is password protected");
+ }
+
+ // determine the version of Word this document came from.
+ int nFib = LittleEndian.getShort(header, 0x2);
+ switch (nFib) {
+ case 101:
+ case 102:
+ case 103:
+ case 104:
+ // this is a Word 6.0 doc send it to the extractor for that version.
+ Word6Extractor oldExtractor = new Word6Extractor();
+ return oldExtractor.extractText(header);
+ }
+
+ //get the location of the piece table
+ int complexOffset = LittleEndian.getInt(header, 0x1a2);
+
+ // determine which table stream we must use.
+ //Get the information we need from the header
+ String tableName = null;
+ boolean useTable1 = (info & 0x200) != 0;
+ if (useTable1) {
+ tableName = "1Table";
+ } else {
+ tableName = "0Table";
+ }
+
+ DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
+ byte[] tableStream = new byte[table.getSize()];
+
+ din = fsys.createDocumentInputStream(tableName);
+
+ din.read(tableStream);
+ din.close();
+
+ int chpOffset = LittleEndian.getInt(header, 0xfa);
+ int chpSize = LittleEndian.getInt(header, 0xfe);
+ int fcMin = LittleEndian.getInt(header, 0x18);
+ CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
+
+ // load our text pieces and our character runs
+ ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
+ TextPieceTable tpt = cft.getTextPieceTable();
+ List textPieces = tpt.getTextPieces();
+
+ // make the POIFS objects available for garbage collection
+ din = null;
+ fsys = null;
+ table = null;
+ headerProps = null;
+
+ List textRuns = cbt.getTextRuns();
+ Iterator runIt = textRuns.iterator();
+ Iterator textIt = textPieces.iterator();
+
+ TextPiece currentPiece = (TextPiece)textIt.next();
+ int currentTextStart = currentPiece.getStart();
+ int currentTextEnd = currentPiece.getEnd();
+
+ WordTextBuffer finalTextBuf = new WordTextBuffer();
+
+ // iterate through all text runs extract the text only if they haven't been
+ // deleted
+ while (runIt.hasNext()) {
+ CHPX chpx = (CHPX)runIt.next();
+ boolean deleted = isDeleted(chpx.getGrpprl());
+ if (deleted) {
+ continue;
+ }
+
+ int runStart = chpx.getStart();
+ int runEnd = chpx.getEnd();
+
+ while (runStart >= currentTextEnd) {
+ currentPiece = (TextPiece) textIt.next ();
+ currentTextStart = currentPiece.getStart ();
+ currentTextEnd = currentPiece.getEnd ();
+ }
+
+ if (runEnd < currentTextEnd) {
+ String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
+ finalTextBuf.append(str);
+ } else if (runEnd > currentTextEnd) {
+ while (runEnd > currentTextEnd) {
+ String str = currentPiece.substring(runStart - currentTextStart,
+ currentTextEnd - currentTextStart);
+ finalTextBuf.append(str);
+ if (textIt.hasNext()) {
+ currentPiece = (TextPiece) textIt.next ();
+ currentTextStart = currentPiece.getStart ();
+ runStart = currentTextStart;
+ currentTextEnd = currentPiece.getEnd ();
+ } else {
+ return finalTextBuf.toString();
+ }
+ }
+ String str = currentPiece.substring(0, runEnd - currentTextStart);
+ finalTextBuf.append(str);
+ } else {
+ String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
+ if (textIt.hasNext()) {
+ currentPiece = (TextPiece) textIt.next();
+ currentTextStart = currentPiece.getStart();
+ currentTextEnd = currentPiece.getEnd();
+ }
+ finalTextBuf.append(str);
+ }
+ }
+ return finalTextBuf.toString();
+ }
+
+ /**
+ * Used to determine if a run of text has been deleted.
+ *
+ * @param grpprl The list of sprms for a particular run of text.
+ * @return true if this run of text has been deleted.
+ */
+ private boolean isDeleted(byte[] grpprl) {
+ SprmIterator iterator = new SprmIterator(grpprl,0);
+ while (iterator.hasNext()) {
+ SprmOperation op = iterator.next();
+ // 0 is the operation that signals a FDelRMark operation
+ if (op.getOperation() == 0 && op.getOperand() != 0) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextBuffer.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextBuffer.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextBuffer.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
Sun Oct 14 05:24:19 2007
@@ -12,7 +12,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
/**
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java
(from r584528,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextPiece.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextPiece.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextPiece.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java
Sun Oct 14 05:24:19 2007
@@ -13,7 +13,7 @@
* limitations under the License.
*/
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
/**
* This class stores info about the data structure describing a chunk of text
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=584532&r1=584531&r2=584532&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Oct 14 05:24:19 2007
@@ -9,15 +9,15 @@
<mime>application/xml</mime>
</parser>
- <parser name="parse-msword" class="org.apache.tika.parser.msword.MsWordParser">
+ <parser name="parse-msword" class="org.apache.tika.parser.microsoft.WordParser">
<mime>application/msword</mime>
</parser>
- <parser name="parse-msexcel" class="org.apache.tika.parser.msexcel.MsExcelParser">
+ <parser name="parse-msexcel" class="org.apache.tika.parser.microsoft.ExcelParser">
<mime>application/vnd.ms-excel</mime>
</parser>
- <parser name="parse-mspowerpoint" class="org.apache.tika.parser.mspowerpoint.MsPowerPointParser">
+ <parser name="parse-mspowerpoint" class="org.apache.tika.parser.microsoft.PowerPointParser">
<mime>application/vnd.ms-powerpoint</mime>
</parser>