Author: jukka
Date: Sun Oct 14 05:24:19 2007
New Revision: 584532

URL: http://svn.apache.org/viewvc?rev=584532&view=rev
Log:
TIKA-60 - Rename Microsoft parser classes

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/
      - copied from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ms/
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
   (with props)
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/FastSavedException.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/FilteredStringWriter.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTConstants.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/PasswordProtectedException.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
   (with props)
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/Slide.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/TextBox.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/chp/Word6CHPBinTable.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/Word6Extractor.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
   (with props)
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextBuffer.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java
      - copied, changed from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextPiece.java
Removed:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ms/
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/msexcel/
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/
Modified:
    incubator/tika/trunk/CHANGES.txt
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
    incubator/tika/trunk/src/main/resources/tika-config.xml

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=584532&r1=584531&r2=584532&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Oct 14 05:24:19 2007
@@ -99,3 +99,5 @@
               config path in TestParsers (jukka)
 
 45. TIKA-58 - Replace jtidy html parser with nekohtml based parser (siren)
+
+46. TIKA-60 - Rename Microsoft parser classes (jukka)

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/ContentReaderListener.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
 Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
 
 import java.util.Enumeration;
 import java.util.Hashtable;

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?rev=584532&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
 Sun Oct 14 05:24:19 2007
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+
+/**
+ * Excel parser
+ */
+public class ExcelParser extends MSParser {
+
+    protected String extractText(InputStream input) throws Exception {
+        StringBuilder builder = new StringBuilder();
+        extractText(new HSSFWorkbook(input), builder);
+        return builder.toString();
+    }
+
+    private void extractText(HSSFWorkbook book, StringBuilder builder) {
+        for (int i = 0; book != null && i < book.getNumberOfSheets(); i++) {
+            extractText(book.getSheetAt(i), builder);
+        }
+    }
+
+    private void extractText(HSSFSheet sheet, StringBuilder builder) {
+        for (int i = 0; sheet != null && i <= sheet.getLastRowNum(); i++) {
+            extractText(sheet.getRow(i), builder);
+        }
+    }
+
+    private void extractText(HSSFRow row, StringBuilder builder) {
+        for (short i = 0; row != null && i < row.getLastCellNum(); i++) {
+            extractText(row.getCell(i), builder);
+        }
+    }
+
+    private void extractText(HSSFCell cell, StringBuilder builder) {
+        if (cell != null) {
+            switch (cell.getCellType()) {
+            case HSSFCell.CELL_TYPE_STRING:
+                addText(cell.getRichStringCellValue().getString(), builder);
+                break;
+            case HSSFCell.CELL_TYPE_NUMERIC:
+            case HSSFCell.CELL_TYPE_FORMULA:
+                addText(Double.toString(cell.getNumericCellValue()), builder);
+                break;
+            default:
+                // ignore
+            } 
+        }
+    }
+
+    private void addText(String text, StringBuilder builder) {
+        if (text != null) {
+            text = text.trim();
+            if (text.length() > 0) {
+                if (builder.length() > 0) {
+                    builder.append(' ');
+                }
+                builder.append(text);
+            }
+        }
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/FastSavedException.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/FastSavedException.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/FastSavedException.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java
 Sun Oct 14 05:24:19 2007
@@ -12,7 +12,7 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
 
 
 public class FastSavedException extends Exception {

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/FilteredStringWriter.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/FilteredStringWriter.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/FilteredStringWriter.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FilteredStringWriter.java
 Sun Oct 14 05:24:19 2007
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
 
 import java.io.StringWriter;
 

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java?rev=584532&r1=584528&r2=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/MSParser.java
 Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.ms;
+package org.apache.tika.parser.microsoft;
 
 // JDK imports
 import java.io.IOException;

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTConstants.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTConstants.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/PPTConstants.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
 Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
 
 /**
  * Package protected class for the required internal MS PowerPoint constants.

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/PasswordProtectedException.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/PasswordProtectedException.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/PasswordProtectedException.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java
 Sun Oct 14 05:24:19 2007
@@ -12,7 +12,7 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
 
 
 public class PasswordProtectedException extends Exception {

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=584532&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
 Sun Oct 14 05:24:19 2007
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+
+/**
+ * Power point parser
+ */
+public class PowerPointParser extends MSParser {
+
+    protected String extractText(InputStream input) throws Exception {
+        StringBuilder builder = new StringBuilder();
+        POIFSReader reader = new POIFSReader();
+        reader.registerListener(
+                new ContentReaderListener(builder),
+                PPTConstants.POWERPOINT_DOCUMENT);
+        reader.read(input);
+        return builder.toString();
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java?rev=584532&r1=584528&r2=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
 Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.ms;
+package org.apache.tika.parser.microsoft;
 
 import org.apache.poi.hpsf.PropertySetFactory;
 import org.apache.poi.hpsf.SummaryInformation;

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java 
(from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/Slide.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/Slide.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/Slide.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java 
Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
 
 import java.util.List;
 import java.util.Vector;

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/TextBox.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/TextBox.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mspowerpoint/TextBox.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/TextBox.java
 Sun Oct 14 05:24:19 2007
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.mspowerpoint;
+package org.apache.tika.parser.microsoft;
 
 /**
  * Package protected class for the MS Powerpoint TextBox content

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/chp/Word6CHPBinTable.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/chp/Word6CHPBinTable.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/chp/Word6CHPBinTable.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6CHPBinTable.java
 Sun Oct 14 05:24:19 2007
@@ -13,7 +13,7 @@
  *  limitations under the License.
  */
 
-package org.apache.tika.parser.msword.chp;
+package org.apache.tika.parser.microsoft;
 
 import java.util.List;
 import java.util.ArrayList;

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/Word6Extractor.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/Word6Extractor.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/Word6Extractor.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
 Sun Oct 14 05:24:19 2007
@@ -13,10 +13,9 @@
  *  limitations under the License.
  */
 
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
 
 
-import org.apache.tika.parser.msword.chp.Word6CHPBinTable;
 
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.hwpf.model.*;

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=584532&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
 Sun Oct 14 05:24:19 2007
@@ -0,0 +1,194 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.poi.hwpf.model.CHPBinTable;
+import org.apache.poi.hwpf.model.CHPX;
+import org.apache.poi.hwpf.model.ComplexFileTable;
+import org.apache.poi.hwpf.model.TextPiece;
+import org.apache.poi.hwpf.model.TextPieceTable;
+import org.apache.poi.hwpf.sprm.SprmIterator;
+import org.apache.poi.hwpf.sprm.SprmOperation;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Word parser
+ */
+public class WordParser extends MSParser {
+
+    /**
+     * Gets the text from a Word document.
+     *
+     * @param in The InputStream representing the Word file.
+     */
+    public String extractText(InputStream in) throws Exception {
+        POIFSFileSystem fsys = new POIFSFileSystem(in);
+
+        // load our POIFS document streams.
+        DocumentEntry headerProps =
+            (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
+        DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
+        byte[] header = new byte[headerProps.getSize()];
+
+        din.read(header);
+        din.close();
+
+        int info = LittleEndian.getShort(header, 0xa);
+        if ((info & 0x4) != 0) {
+            throw new FastSavedException(
+                    "Fast-saved files are unsupported at this time");
+        }
+        if ((info & 0x100) != 0) {
+            throw new PasswordProtectedException(
+                    "This document is password protected");
+        }
+
+        // determine the version of Word this document came from.
+        int nFib = LittleEndian.getShort(header, 0x2);
+        switch (nFib) {
+        case 101:
+        case 102:
+        case 103:
+        case 104:
+            // this is a Word 6.0 doc send it to the extractor for that version.
+            Word6Extractor oldExtractor = new Word6Extractor();
+            return oldExtractor.extractText(header);
+        }
+
+        //get the location of the piece table
+        int complexOffset = LittleEndian.getInt(header, 0x1a2);
+
+        // determine which table stream we must use.
+        //Get the information we need from the header
+        String tableName = null;
+        boolean useTable1 = (info & 0x200) != 0;
+        if (useTable1) {
+            tableName = "1Table";
+        } else {
+            tableName = "0Table";
+        }
+
+        DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
+        byte[] tableStream = new byte[table.getSize()];
+
+        din = fsys.createDocumentInputStream(tableName);
+
+        din.read(tableStream);
+        din.close();
+
+        int chpOffset = LittleEndian.getInt(header, 0xfa);
+        int chpSize = LittleEndian.getInt(header, 0xfe);
+        int fcMin = LittleEndian.getInt(header, 0x18);
+        CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
+
+        // load our text pieces and our character runs
+        ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
+        TextPieceTable tpt = cft.getTextPieceTable();
+        List textPieces = tpt.getTextPieces();
+
+        // make the POIFS objects available for garbage collection
+        din = null;
+        fsys = null;
+        table = null;
+        headerProps = null;
+
+        List textRuns = cbt.getTextRuns();
+        Iterator runIt = textRuns.iterator();
+        Iterator textIt = textPieces.iterator();
+
+        TextPiece currentPiece = (TextPiece)textIt.next();
+        int currentTextStart = currentPiece.getStart();
+        int currentTextEnd = currentPiece.getEnd();
+
+        WordTextBuffer finalTextBuf = new WordTextBuffer();
+
+        // iterate through all text runs extract the text only if they haven't been
+        // deleted
+        while (runIt.hasNext()) {
+            CHPX chpx = (CHPX)runIt.next();
+            boolean deleted = isDeleted(chpx.getGrpprl());
+            if (deleted) {
+                continue;
+            }
+
+            int runStart = chpx.getStart();
+            int runEnd = chpx.getEnd();
+
+            while (runStart >= currentTextEnd) {
+                currentPiece = (TextPiece) textIt.next ();
+                currentTextStart = currentPiece.getStart ();
+                currentTextEnd = currentPiece.getEnd ();
+            }
+
+            if (runEnd < currentTextEnd) {
+                String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
+                finalTextBuf.append(str);
+            } else if (runEnd > currentTextEnd) {
+                while (runEnd > currentTextEnd) {
+                    String str = currentPiece.substring(runStart - currentTextStart,
+                            currentTextEnd - currentTextStart);
+                    finalTextBuf.append(str);
+                    if (textIt.hasNext()) {
+                        currentPiece = (TextPiece) textIt.next ();
+                        currentTextStart = currentPiece.getStart ();
+                        runStart = currentTextStart;
+                        currentTextEnd = currentPiece.getEnd ();
+                    } else {
+                        return finalTextBuf.toString();
+                    }
+                }
+                String str = currentPiece.substring(0, runEnd - currentTextStart);
+                finalTextBuf.append(str);
+            } else {
+                String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
+                if (textIt.hasNext()) {
+                    currentPiece = (TextPiece) textIt.next();
+                    currentTextStart = currentPiece.getStart();
+                    currentTextEnd = currentPiece.getEnd();
+                }
+                finalTextBuf.append(str);
+            }
+        }
+        return finalTextBuf.toString();
+    }
+
+    /**
+     * Used to determine if a run of text has been deleted.
+     *
+     * @param grpprl The list of sprms for a particular run of text.
+     * @return true if this run of text has been deleted.
+     */
+    private boolean isDeleted(byte[] grpprl) {
+        SprmIterator iterator = new SprmIterator(grpprl,0);
+        while (iterator.hasNext()) {
+            SprmOperation op = iterator.next();
+            // 0 is the operation that signals a FDelRMark operation
+            if (op.getOperation() == 0 && op.getOperand() != 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+}

Propchange: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextBuffer.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextBuffer.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextBuffer.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java
 Sun Oct 14 05:24:19 2007
@@ -12,7 +12,7 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
 
 
 /**

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java
 (from r584528, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextPiece.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextPiece.java&r1=584528&r2=584532&rev=584532&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/msword/WordTextPiece.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextPiece.java
 Sun Oct 14 05:24:19 2007
@@ -13,7 +13,7 @@
  *  limitations under the License.
  */
 
-package org.apache.tika.parser.msword;
+package org.apache.tika.parser.microsoft;
 
 /**
  * This class stores info about the data structure describing a chunk of text

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=584532&r1=584531&r2=584532&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Oct 14 05:24:19 2007
@@ -9,15 +9,15 @@
                 <mime>application/xml</mime>
         </parser>
 
-        <parser name="parse-msword" class="org.apache.tika.parser.msword.MsWordParser">
+        <parser name="parse-msword" class="org.apache.tika.parser.microsoft.WordParser">
                 <mime>application/msword</mime>
         </parser>
 
-        <parser name="parse-msexcel" class="org.apache.tika.parser.msexcel.MsExcelParser">
+        <parser name="parse-msexcel" class="org.apache.tika.parser.microsoft.ExcelParser">
                 <mime>application/vnd.ms-excel</mime>
         </parser>
 
-        <parser name="parse-mspowerpoint" class="org.apache.tika.parser.mspowerpoint.MsPowerPointParser">
+        <parser name="parse-mspowerpoint" class="org.apache.tika.parser.microsoft.PowerPointParser">
                 <mime>application/vnd.ms-powerpoint</mime>
         </parser>
 


Reply via email to