microsoft: ExcelParser.java OfficeParser.java PowerPointExtractor.java PowerPointParser.java WordParser.java

jukka Sun, 17 Feb 2008 03:41:04 -0800

Author: jukka
Date: Sun Feb 17 03:40:20 2008
New Revision: 628475

URL: http://svn.apache.org/viewvc?rev=628475&view=rev
Log:
TIKA-123: Structured MS Office parsing
    - Changed OfficeParser to allow structured parsing in subclasses
    - ExcelParser now outputs XHTML tables with nice tabs and line breaks
    - Dropped unused formatting code from ExcelParser (TODO fix that)
    - Streamlined PowerPointParser and started using Java 5 features
    - No functional changes (yet) in PowerPointParser
    - No functional changes (yet) in WordParser


Removed:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
Modified:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?rev=628475&r1=628474&r2=628475&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
 Sun Feb 17 03:40:20 2008
@@ -19,10 +19,10 @@
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFListener;
 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
@@ -41,11 +41,12 @@
 import org.apache.poi.hssf.record.RKRecord;
 import org.apache.poi.hssf.record.Record;
 import org.apache.poi.hssf.record.SSTRecord;
-import org.apache.poi.hssf.record.UnicodeString;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
  * Excel parser implementation which uses POI's Event API
@@ -115,16 +116,18 @@
      * to the specified {@link Appendable}.
      *
      * @param filesystem POI file system
-     * @param appendable Where to output the parsed contents
      * @throws IOException if an error occurs processing the workbook
      * or writing the extracted content
      */
-    protected void extractText(final POIFSFileSystem filesystem,
-            final Appendable appendable) throws IOException {
+    protected void parse(
+            POIFSFileSystem filesystem, ContentHandler handler, Metadata 
metadata)
+            throws IOException, SAXException {
         log.debug("Starting listenForAllRecords=" + listenForAllRecords);
 
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
         // Set up listener and register the records we want to process
-        TikaHSSFListener listener = new TikaHSSFListener(appendable);
+        TikaHSSFListener listener = new TikaHSSFListener(xhtml);
         HSSFRequest hssfRequest = new HSSFRequest();
         if (listenForAllRecords) {
             hssfRequest.addListenerForAllRecords(listener);
@@ -147,9 +150,11 @@
         // Create event factory and process Workbook (fire events)
         DocumentInputStream documentInputStream = 
filesystem.createDocumentInputStream("Workbook");
         HSSFEventFactory eventFactory = new HSSFEventFactory();
-        eventFactory.processEvents(hssfRequest, documentInputStream);
 
-        log.debug("Processed " + listener.getRecordCount() + " records");
+        xhtml.startDocument();
+        eventFactory.processEvents(hssfRequest, documentInputStream);
+        listener.throwStoredException();
+        xhtml.endDocument();
     }
 
     // ======================================================================
@@ -162,38 +167,29 @@
         /** Logging instance */
         private static Log log = LogFactory.getLog(ExcelParser.class);
 
-        private final Appendable appendable;
-        private int recordCount;
+        private final XHTMLContentHandler handler;
+
+        private SAXException exception;
+
         private SSTRecord sstRecord;
-        private Map<Short, String> formats        = new HashMap<Short, 
String>();
-        private Map<Short, Short> extendedFormats = new HashMap<Short, 
Short>();
         private List<String> sheetNames = new ArrayList<String>();
-        private short bofRecordType;
-        private short defualtCountry;
-        private short currentCountry;
-        private short currentXFormatIdx;
         private short currentSheetIndex;
-        private String currentSheetName;
-        private boolean firstElement = true;
-        private boolean use1904windowing = false;
 
-        /**
-         * Contstruct a new listener instance outputting parsed data to
-         * the specified Appendable.
-         *
-         * @param appendable Destination to write the parsed output to
-         */
-        private TikaHSSFListener(final Appendable appendable) {
-            this.appendable = appendable;
-        }
+        private boolean insideWorksheet = false;
+
+        private int currentRow;
+
+        private short currentColumn;
 
         /**
-         * Return a count of the number of records processed.
+         * Construct a new listener instance outputting parsed data to
+         * the specified XHTML content handler.
          *
-         * @return The number of records processed by this listener
+         * @param handler Destination to write the parsed output to
          */
-        private int getRecordCount() {
-            return recordCount;
+        private TikaHSSFListener(XHTMLContentHandler handler) {
+            this.handler = handler;
+            this.exception = null;
         }
 
         /**
@@ -201,64 +197,70 @@
          *
          * @param record HSSF Record
          */
-        public void processRecord(final Record record) {
-            recordCount++;
-            final short sid = record.getSid();
-            switch (sid) {
+        public void processRecord(Record record) {
+            try {
+                if (log.isDebugEnabled()) {
+                    log.debug(record.toString());
+                }
+                internalProcessRecord(record);
+            } catch (SAXException e) {
+                if (exception == null) {
+                    exception = e;
+                }
+            }
+        }
+
+        public void throwStoredException() throws SAXException {
+            if (exception != null) {
+                throw exception;
+            }
+        }
+
+        private void internalProcessRecord(Record record) throws SAXException {
+            switch (record.getSid()) {
 
                 /* BOFRecord: indicates start of workbook, worksheet etc. 
records */
                 case BOFRecord.sid:
-                    BOFRecord bofRecord = (BOFRecord)record;
-                    bofRecordType = bofRecord.getType();
-                    switch (bofRecordType) {
+                    switch (((BOFRecord) record).getType()) {
                         case BOFRecord.TYPE_WORKBOOK:
                             currentSheetIndex = -1;
-                            debug(record, ".Workbook");
                             break;
                         case BOFRecord.TYPE_WORKSHEET:
                             currentSheetIndex++;
-                            currentSheetName = null;
+                            String currentSheetName = "";
                             if (currentSheetIndex < sheetNames.size()) {
                                 currentSheetName = 
sheetNames.get(currentSheetIndex);
                             }
-                            debug(record,
-                                    ".Worksheet[" + currentSheetIndex
-                                    + "], Name=[" + currentSheetName + "]");
-                            addText(currentSheetName);
-                            break;
-                        default:
-                            debug(record, "[" + bofRecordType + "]");
+                            handler.startElement("div", "class", "page");
+                            handler.element("h1", currentSheetName);
+                            handler.characters("\n");
+                            handler.startElement("table");
+                            handler.startElement("tbody");
+                            handler.startElement("tr");
+                            handler.startElement("td");
+                            insideWorksheet = true;
+                            currentRow = 0;
+                            currentColumn = 0;
                             break;
                     }
                     break;
 
-                /* BOFRecord: indicates end of workbook, worksheet etc. 
records */
+                /* EOFRecord: indicates end of workbook, worksheet etc. 
records */
                 case EOFRecord.sid:
-                    debug(record, "");
-                    bofRecordType = 0;
-                    break;
-
-                /* Indicates whether to use 1904 Date Windowing or not */
-                case DateWindow1904Record.sid:
-                    DateWindow1904Record dw1904Rec = 
(DateWindow1904Record)record;
-                    use1904windowing = (dw1904Rec.getWindowing() == 1);
-                    debug(record, "[" + use1904windowing + "]");
-                    break;
-
-                /* CountryRecord: holds all the strings for LabelSSTRecords */
-                case CountryRecord.sid:
-                    CountryRecord countryRecord = (CountryRecord)record;
-                    defualtCountry = countryRecord.getDefaultCountry();
-                    currentCountry = countryRecord.getCurrentCountry();
-                    debug(record,
-                            " default=[" + defualtCountry
-                            + "], current=[" + currentCountry + "]");
+                    if (insideWorksheet) {
+                        handler.endElement("td");
+                        handler.endElement("tr");
+                        handler.endElement("tbody");
+                        handler.endElement("table");
+                        handler.endElement("div");
+                        handler.characters("\n");
+                        insideWorksheet = false;
+                    }
                     break;
 
                 /* SSTRecord: holds all the strings for LabelSSTRecords */
                 case SSTRecord.sid:
                     sstRecord = (SSTRecord)record;
-                    debug(record, "");
                     break;
 
                 /* BoundSheetRecord: Worksheet index record */
@@ -266,41 +268,14 @@
                     BoundSheetRecord boundSheetRecord = 
(BoundSheetRecord)record;
                     String sheetName = boundSheetRecord.getSheetname();
                     sheetNames.add(sheetName);
-                    debug(record,
-                            "[" + sheetNames.size()
-                            + "], Name=[" + sheetName + "]");
-                    break;
-
-                /* FormatRecord */
-                case FormatRecord.sid:
-                    FormatRecord formatRecord = (FormatRecord)record;
-                    String dataFormat = formatRecord.getFormatString();
-                    short formatIdx = formatRecord.getIndexCode();
-                    formats.put(formatIdx, dataFormat);
-                    debug(record, "[" + formatIdx + "]=[" + dataFormat + "]");
-                    break;
-
-                /* ExtendedFormatRecord */
-                case ExtendedFormatRecord.sid:
-                    ExtendedFormatRecord xFormatRecord = 
(ExtendedFormatRecord)record;
-                    if (xFormatRecord.getXFType() == 
ExtendedFormatRecord.XF_CELL) {
-                        short dataFormatIdx = xFormatRecord.getFormatIndex();
-                        if (dataFormatIdx > 0) {
-                            extendedFormats.put(currentXFormatIdx, 
dataFormatIdx);
-                            debug(record,
-                                    "[" + currentXFormatIdx
-                                    + "]=FormatRecord[" + dataFormatIdx + "]");
-                        }
-                    }
-                    currentXFormatIdx++;
                     break;
 
                 default:
-                    if (bofRecordType == BOFRecord.TYPE_WORKSHEET
+                    if (insideWorksheet
                             && record instanceof CellValueRecordInterface) {
-                        processCellValue(sid, 
(CellValueRecordInterface)record);
-                    } else {
-                        debug(record, "");
+                        processCellValue(
+                                record.getSid(),
+                                (CellValueRecordInterface)record);
                     }
                     break;
             }
@@ -312,102 +287,57 @@
          * @param sid record type identifier
          * @param record The cell value record
          */
-        private void processCellValue(final short sid,
-                final CellValueRecordInterface record) {
+        private void processCellValue(
+                short sid, CellValueRecordInterface record)
+                throws SAXException {
+            while (currentRow < record.getRow()) {
+                handler.endElement("td");
+                handler.endElement("tr");
+                handler.characters("\n");
+                handler.startElement("tr");
+                handler.startElement("td");
+                currentRow++;
+                currentColumn = 0;
+            }
+            while (currentColumn < record.getColumn()) {
+                handler.endElement("td");
+                handler.characters("\t");
+                handler.startElement("td");
+                currentColumn++;
+            }
 
-            short xfIdx = record.getXFIndex();
-            Short dfIdx = extendedFormats.get(xfIdx);
-            String dataFormat = dfIdx != null ? formats.get(dfIdx) : null;
-            String str = null;
             switch (sid) {
-
                 /* FormulaRecord: Cell value from a formula */
                 case FormulaRecord.sid:
                     FormulaRecord formulaRecord = (FormulaRecord)record;
                     double fmlValue = formulaRecord.getValue();
-                    str = toString(fmlValue, dfIdx, dataFormat);
-                    str = addText(str);
+                    addText(Double.toString(fmlValue));
                     break;
 
                 /* LabelRecord: strings stored directly in the cell */
                 case LabelRecord.sid:
-                    LabelRecord labelRecord = (LabelRecord)record;
-                    str = addText(labelRecord.getValue());
+                    addText(((LabelRecord) record).getValue());
                     break;
 
                 /* LabelSSTRecord: Ref. a string in the shared string table */
                 case LabelSSTRecord.sid:
-                    LabelSSTRecord labelSSTRecord = (LabelSSTRecord)record;
+                    LabelSSTRecord labelSSTRecord = (LabelSSTRecord) record;
                     int sstIndex = labelSSTRecord.getSSTIndex();
-                    UnicodeString unicodeStr = sstRecord.getString(sstIndex);
-                    str = addText(unicodeStr.getString());
+                    addText(sstRecord.getString(sstIndex).getString());
                     break;
 
                 /* NumberRecord: Contains a numeric cell value */
                 case NumberRecord.sid:
                     double numValue = ((NumberRecord)record).getValue();
-                    if (!Double.isNaN(numValue)) {
-                        str = Double.toString(numValue);
-                    }
-                    str = toString(numValue, dfIdx, dataFormat);
-                    str = addText(str);
+                    addText(Double.toString(numValue));
                     break;
 
                 /* RKRecord: Excel internal number record */
                 case RKRecord.sid:
                     double rkValue = ((RKRecord)record).getRKNumber();
-                    str = toString(rkValue, dfIdx, dataFormat);
-                    str = addText(str);
+                    addText(Double.toString(rkValue));
                     break;
             }
-
-            // =========== Debug Mess: START ===========
-            if (log.isDebugEnabled()) {
-                StringBuilder builder = new StringBuilder();
-                builder.append('[');
-                // 
builder.append(ExcelUtils.columnIndexToLabel(record.getColumn()));
-                builder.append(record.getColumn());
-                builder.append(":");
-                builder.append((record.getRow() + 1));
-                builder.append(']');
-                if (dfIdx != null) {
-                    builder.append(" xfIdx[");
-                    builder.append(xfIdx).append(']');
-                    builder.append("=dfIdx[");
-                    builder.append(dfIdx);
-                    builder.append(']');
-                    if (dataFormat != null) {
-                        builder.append("=[");
-                        builder.append(dataFormat);
-                        builder.append(']');
-                    }
-                }
-                builder.append(", value=[");
-                if (str != null && str.length() > 0) {
-                    builder.append(str);
-                }
-                builder.append(']');
-                debug((Record)record, builder.toString());
-            }
-            // =========== Debug Mess: END =============
-        }
-
-        /**
-         * Converts a numeric excel cell value to a String.
-         *
-         * @param value The cell value
-         * @param dfIdx The data format index
-         * @param dataFormat The data format
-         * @return Formatted string value
-         */
-        private String toString(double value, Short dfIdx, String dataFormat) {
-            if (Double.isNaN(value)) {
-                return null;
-            }
-
-            // **** TODO: Data Format parsing ****
-            // return ExcelUtils.format(value, dfIdx, dataFormat, 
use1904windowing);
-            return Double.toString(value);
         }
 
         /**
@@ -416,43 +346,15 @@
          * Null and zero length values are ignored.
          *
          * @param text The text value
-         * @return the added text
          */
-        private String addText(String text) {
+        private void addText(String text) throws SAXException {
             if (text != null) {
                 text = text.trim();
                 if (text.length() > 0) {
-                    try {
-                        if (!firstElement) {
-                            appendable.append(" ");
-                        }
-                        appendable.append(text);
-                    } catch (Exception e) {
-                        throw new RuntimeException(e);
-                    }
-                    firstElement = false;
+                    handler.characters(text);
                 }
             }
-            return text;
         }
 
-        /**
-         * Record debugging.
-         *
-         * @param record The Record
-         * @param msg Debug Message
-         */
-        private void debug(Record record, String msg) {
-            if (log.isDebugEnabled()) {
-                String className = record.getClass().getSimpleName();
-                String text = (msg == null ? className :  className + msg);
-                if (record.getSid() == BOFRecord.sid ||
-                    record.getSid() == EOFRecord.sid) {
-                    log.debug(text);
-                } else {
-                    log.debug("    " + text);
-                }
-            }
-        }
     }
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=628475&r1=628474&r2=628475&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Sun Feb 17 03:40:20 2008
@@ -29,8 +29,6 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.AppendableAdaptor;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -54,13 +52,7 @@
                 filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
                 metadata);
 
-        XHTMLContentHandler xhtml =
-            new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        xhtml.startElement("p");
-        extractText(filesystem, new AppendableAdaptor(xhtml));
-        xhtml.endElement("p");
-        xhtml.endDocument();
+        parse(filesystem, handler, metadata);
     }
 
     /**
@@ -73,8 +65,9 @@
     /**
      * Extracts the text content from a Microsoft document input stream.
      */
-    protected abstract void extractText(POIFSFileSystem filesystem, Appendable 
appendable)
-        throws IOException, TikaException;
+    protected abstract void parse(
+            POIFSFileSystem filesystem, ContentHandler handler, Metadata 
metadata)
+            throws IOException, SAXException, TikaException;
 
     private void getMetadata(
             POIFSFileSystem filesystem, String name, Metadata metadata) {

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=628475&r1=628474&r2=628475&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
 Sun Feb 17 03:40:20 2008
@@ -18,8 +18,20 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
+import org.apache.log4j.Logger;
+import org.apache.poi.hdf.extractor.Utils;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
  * Power point parser
@@ -29,20 +41,407 @@
     /**
      *  Name of a PowerPoint document within a POIFS file system
      */
-    private  static final String POWERPOINT = "PowerPoint Document";
+    private static final String POWERPOINT = "PowerPoint Document";
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger LOG = Logger.getLogger(PowerPointParser.class);
 
     protected String getContentType() {
         return "application/vnd.ms-powerpoint";
     }
 
-    protected void extractText(POIFSFileSystem filesystem, Appendable builder) 
throws IOException {
-
-        InputStream stream = filesystem.createDocumentInputStream(POWERPOINT);
+    protected void parse(
+            POIFSFileSystem poifs, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        InputStream stream = poifs.createDocumentInputStream(POWERPOINT);
         try {
-            new PowerPointExtractor(builder).extract(stream);
+            xhtml.startDocument();
+            xhtml.startElement("p");
+            parse(stream, xhtml);
+            xhtml.endElement("p");
+            xhtml.endDocument();
         } finally {
             stream.close();
         }
+    }
+
+    /**
+     * Reads the internal PowerPoint document stream.
+     */
+    private void parse(InputStream dis, XHTMLContentHandler xhtml) {
+        try {
+            final byte pptdata[] = new byte[dis.available()];
+            dis.read(pptdata, 0, dis.available());
+            int offset = 0;
+            long offsetPD = 0;
+
+            /*
+             * Traverse Bytearray to get CurrentUserEditAtom Call to extract 
the
+             * Text in all PlaceHolders to hold PPTClientTextBox objects for
+             * mapping into Slide Objects
+             */
+            Map<Long, TextBox> containerTextBox = new HashMap<Long, TextBox>();
+            // Traverse ByteArray to identify edit paths of ClientTextBoxes
+            long n = pptdata.length - 20;
+            for (long i = 0; i < n; i++) {
+
+                final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
+                // final long size = LittleEndian.getUInt(pptdata, (int) i + 
4);
+
+                if (PPTConstants.PPT_ATOM_USEREDIT == type) {
+                    /*
+                     * Checking the Record Header (UserEditAtom)
+                     */
+                    // final long lastSlideID = LittleEndian.getInt(pptdata,
+                    // (int) i + 8);
+                    // final long version = LittleEndian.getUInt(pptdata, (int)
+                    // i + 12);
+                    offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
+                    offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);
+
+                    /*
+                     * Call to extract ClientTextBox text in each UserEditAtom
+                     */
+                    extractTextBoxes(containerTextBox, offset, pptdata, 
offsetPD);
+                } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
+                    // if (LOG.isTraceEnabled()) {
+                    // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
+                    // }
+                } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
+                    // if (LOG.isTraceEnabled()) {
+                    // LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
+                    // }
+                } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
+                    // if (LOG.isTraceEnabled()) {
+                    // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
+                    // }
+                } else {
+                    // no action
+                    // if (LOG.isTraceEnabled()) {
+                    // LOG.trace("type not handled: " + type);
+                    // }
+                }
+            }
+
+            List<Slide> slides = extractSlides(offset, pptdata, offsetPD);
+
+            if (slides.size() == 0) {
+                if (LOG.isInfoEnabled()) {
+                    LOG.info("No slides extracted!");
+                }
+
+            } else {
+                Slide slide = (Slide) slides.get(slides.size() - 1);
+
+                for (TextBox textBox : containerTextBox.values()) {
+                    slide.addContent(textBox.getContent());
+                }
+
+                /*
+                 * Merging TextBox data with Slide Data Printing the text from
+                 * Slides vector object.
+                 */
+                for (Slide s : slides) {
+                    List scontent = s.getContent();
+                    for (int j = 0; j < scontent.size(); j++) {
+                        String contentText = scontent.get(j).toString();
+                        xhtml.characters(contentText);
+
+                        // to avoid concatinated words we add a blank 
additional
+                        if (contentText.length() > 0
+                                && !(contentText.endsWith("\r") || contentText
+                                        .endsWith("\n"))) {
+                            xhtml.characters(" ");
+                        }
+                    }
+                }
+            }
+        } catch (Throwable ex) {
+            // because of not killing complete crawling all Throwables are
+            // catched.
+
+            LOG.error("processPOIFSReaderEvent", ex);
+        }
+    }
+
+    /**
+     * Extracts the client text boxes of a slide.
+     * 
+     * @param containerTextBox
+     * @param offset
+     * @param pptdata
+     * @param offsetPD
+     * @see TextBox
+     */
+    private void extractTextBoxes(
+            Map<Long, TextBox> containerTextBox,
+            int offset, byte[] pptdata, long offsetPD) {
+
+        // To hold temporary data
+        FilteredStringWriter outStream = new FilteredStringWriter();
+
+        TextBox textBox;
+
+        // Traversing the bytearray up to Persist directory position
+        for (int i = offset; i < offsetPD - 20; i++) {
+            try {
+                // Record info
+                // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
+                // Record Type
+                final long recordType = LittleEndian.getUShort(pptdata, i + 2);
+                // Record Size
+                final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
+
+                if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+                    /*
+                     * Record type is of Drawing Group
+                     */
+
+                    // Total number of objects
+                    // final long objectCount = LittleEndian.getUInt(pptdata,
+                    // (int) i +
+                    // 8);
+                    // currentID = Group ID+number of objects
+                    long currentID = LittleEndian.getInt(pptdata, i + 12);
+                    currentID = ((int) (currentID / 1024)) * 1024;
+
+                    if (currentID == PPTConstants.PPT_MASTERSLIDE) {
+                        // Ignore Master Slide objects
+                        if (LOG.isTraceEnabled()) {
+                            LOG.trace("Ignore master slide.");
+                        }
+                        i++;
+                        continue;
+                    }
+
+                    // Check for the ClientTextBox GroupID existence
+                    if (containerTextBox.containsKey(new Long(currentID))) {
+                        // If exists get Client Textbox Group
+                        textBox = (TextBox) containerTextBox.get(new Long(
+                                currentID));
+                        textBox.setContent("");
+
+                    } else {
+                        textBox = new TextBox(currentID);
+                        containerTextBox.put(new Long(currentID), textBox);
+                    }
+
+                    /*
+                     * Iterating the bytearray for TextCharAtoms and
+                     * TextBytesAtom
+                     */
+                    if ((offsetPD - 20) != recordSize) {
+                        // TODO something wrong? Probably an OLE-Object, which
+                        // we ignore.
+                        if (LOG.isDebugEnabled()) {
+                            LOG.debug("offsetPD - 20=" + (offsetPD - 20)
+                                    + " recordsize=" + recordSize);
+                        }
+                    } else {
+                        for (int startPos = i + 8; startPos < offsetPD - 20
+                                && startPos < recordSize; startPos++) { // &&
+                            // startPos
+                            // <
+                            // recordSize??
+                            try {
+
+                                // Record info
+                                // final long nrinfo =
+                                // LittleEndian.getUShort(pptdata, (int) j);
+
+                                // Record Type
+                                final long ntype = LittleEndian.getUShort(
+                                        pptdata, startPos + 2);
+
+                                // Record size
+                                // Note that the size doesn't include the 8 byte
+                                // atom header
+                                final long nsize = LittleEndian.getUInt(
+                                        pptdata, startPos + 4);
+
+                                if (ntype == 
PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+                                    /*
+                                     * Break the loop if next GroupID found
+                                     */
+                                    i = startPos - 1;
+                                    break;
+                                } else if (ntype == 
PPTConstants.PPT_ATOM_TEXTBYTE) {
+                                    // TextByteAtom record
+                                    outStream = new FilteredStringWriter();
+                                    long ii = 0;
+                                    for (ii = startPos + 6; ii <= startPos + 6
+                                            + nsize; ii++) {
+                                        // For loop to changed to a function
+                                        // if ((ii + 2) >= pptdata.length)
+                                        // break; // FIXME
+                                        outStream
+                                                .write((char) (pptdata[(int) 
ii + 2]));
+                                    }
+
+                                    // Setting the identified text for Current
+                                    // groupID
+                                    textBox.setContent(textBox.getContent()
+                                            + outStream.toString());
+
+                                } else if (ntype == 
PPTConstants.PPT_ATOM_TEXTCHAR) {
+                                    // TextCharAtom record
+
+                                    final String strTempContent = new String(
+                                            pptdata, startPos + 6,
+                                            (int) (nsize) + 2);
+                                    final byte bytes[] = strTempContent
+                                            .getBytes();
+                                    if (true) {
+                                        outStream = new FilteredStringWriter();
+                                        for (int ii = 0; ii < bytes.length - 
1; ii += 2) {
+                                            // For loop to changed to a function
+                                            outStream
+                                                    .write((char) (pptdata[ii 
+ 2]));
+                                        }
+                                        textBox.setContent(textBox.getContent()
+                                                + outStream.toString());
+                                    } else {
+                                        // this version is used within POI
+                                        String text = StringUtil
+                                                .getFromCompressedUnicode(
+                                                        bytes, 0, 
bytes.length);
+                                        textBox.setContent(textBox.getContent()
+                                                + text);
+                                    }
+
+                                } else {
+                                    // ignored
+                                    // if (LOG.isTraceEnabled()) {
+                                    // LOG.trace("Ignored atom type: " + type);
+                                    // }
+                                }
+                            } catch (Throwable e) {
+
+                                LOG.error("extractTextBoxes", e);
+
+                                break;
+                            }
+                        }
+                    }
+                } else {
+                    // Record type is ignored
+                    // if (LOG.isTraceEnabled()) {
+                    // LOG.trace("Ignored record type: " + type);
+                    // }
+                }
+            } catch (Throwable ee) {
+                LOG.error("extractClientTextBoxes", ee);
+            }
+        }
+    }
+
+    /**
+     * Scans the raw PowerPoint data for slide records and collects the text
+     * found in them.
+     * <p>
+     * Starting at <code>offset</code>, every byte position up to 20 bytes
+     * before the end of <code>pptdata</code> is treated as a potential record
+     * header: the value read with <code>LittleEndian.getUShort</code> at
+     * position + 2 is taken as the record type and the value read with
+     * <code>LittleEndian.getUInt</code> at position + 4 as the record size.
+     * The position always advances by a single byte, regardless of the
+     * record size that was read.
+     * <p>
+     * The first slide persist atom encountered is skipped; each later one
+     * starts a new {@link Slide}. Text from TextByteAtom and TextCharAtom
+     * records is added to the most recently started slide and is dropped if
+     * no slide has been started yet. The scan ends at the first drawing
+     * group record.
+     *
+     * @param offset position in <code>pptdata</code> at which scanning starts
+     * @param pptdata the bytes to scan
+     * @param offsetPD not used by this method
+     * @return list of the slides found, in the order they were encountered;
+     *         empty if none were found. Contains {@link Slide} objects.
+     * @see Slide
+     */
+    private List<Slide> extractSlides(
+            long offset, byte[] pptdata, long offsetPD) {
+        int sNum = 0;
+
+        // List of all slides found
+        List<Slide> slides = new ArrayList<Slide>();
+
+        // Slide currently receiving text; null until the second slide persist atom is seen
+        Slide currentSlide = null;
+
+        // To store data found in TextCharAtoms and TextBytesAtoms
+        FilteredStringWriter outStream;
+
+        for (long i = offset; i < pptdata.length - 20; i++) {
+            final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
+            final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
+
+            if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
+                /*
+                 * TextByteAtom record
+                 *
+                 * The bytes at indexes i + 8 through i + 8 + atomSize
+                 * (inclusive, limited by the array length) are written to
+                 * outStream one at a time.
+                 *
+                 * NOTE(review): the inclusive upper bound copies
+                 * atomSize + 1 bytes - confirm whether that is intended.
+                 */
+                outStream = new FilteredStringWriter();
+
+                for (long ii = i + 6; (ii <= i + 6 + atomSize)
+                        && (ii + 2 < pptdata.length); ii++) {
+                    try {
+                        // The loop condition already keeps ii + 2 inside the array
+                        byte value = pptdata[(int) ii + 2];
+                        outStream.write(value);
+                    } catch (ArrayIndexOutOfBoundsException ex) {
+                        if (LOG.isTraceEnabled()) {
+                            LOG.trace("size=" + pptdata.length);
+                        }
+
+                        LOG.error("extractSlides", ex);
+
+                    }
+                }
+
+                // Add the text to the current slide; dropped if no slide was started yet
+                if (currentSlide != null) {
+                    currentSlide.addContent(outStream.toString());
+                }
+
+            } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
+                /*
+                 * TextCharAtom record
+                 *
+                 * atomSize + 2 bytes starting at index i + 6 are decoded to
+                 * a String and encoded back to bytes, both with the platform
+                 * default charset. The resulting bytes are then handed to
+                 * Utils.getUnicodeCharacter two at a time.
+                 *
+                 * NOTE(review): the String round trip may not return the
+                 * original bytes for every charset, and the range is not
+                 * checked against the array length - confirm.
+                 */
+                outStream = new FilteredStringWriter();
+                final String strTempContent = new String(pptdata, (int) i + 6,
+                        (int) (atomSize) + 2);
+                final byte bytes[] = strTempContent.getBytes();
+
+                for (int ii = 0; ii < bytes.length - 1; ii += 2) {
+                    outStream.write(Utils.getUnicodeCharacter(bytes, ii));
+                }
+
+                // Add the text to the current slide; dropped if no slide was started yet
+                if (currentSlide != null) {
+                    currentSlide.addContent(outStream.toString());
+                }
+
+            } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
+                /*
+                 * SlidePersistAtom Record
+                 *
+                 * sNum counts the records of this type seen so far. The
+                 * first one (sNum == 0) does not create a slide; each later
+                 * one creates a Slide whose ID is the unsigned int read at
+                 * index i + 20 and makes it the current slide.
+                 *
+                 * NOTE(review): the FilteredStringWriter created below is
+                 * never written to or read before it is replaced.
+                 */
+                if (sNum != 0) {
+                    outStream = new FilteredStringWriter();
+
+                    final long slideID = LittleEndian.getUInt(pptdata,
+                            (int) i + 20);
+
+                    currentSlide = new Slide(slideID);
+                    // currentSlide.addContent(outStream.toString());
+                    slides.add(currentSlide);
+                }
+                sNum++;
+            } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+                /*
+                 * Drawing group records are not processed here; the first
+                 * one found ends the scan.
+                 */
+                if (LOG.isTraceEnabled()) {
+                    LOG.trace("Drawing Groups are ignored.");
+                }
+                break;
+            } else {
+                // ignored
+                // if (LOG.isTraceEnabled()) {
+                // LOG.trace("Unhandled atomType: " + atomType);
+                // }
+            }
+        }
+
+        return slides;
     }
 
 }

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=628475&r1=628474&r2=628475&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
 Sun Feb 17 03:40:20 2008
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import org.apache.log4j.xml.SAXErrorHandler;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Range;
@@ -24,6 +25,11 @@
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.LittleEndian;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AppendableAdaptor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 import java.io.IOException;
 
@@ -42,8 +48,14 @@
      * @param fsys the <code>POIFSFileSystem</code> to read the word document from.
      * @param appendable the <code>Appendable</code> to add the text content to.
      */
-    public void extractText(POIFSFileSystem fsys, Appendable appendable)
-            throws IOException, TikaException {
+    public void parse(
+            POIFSFileSystem fsys, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.startElement("p");
+        Appendable appendable = new AppendableAdaptor(xhtml);
+
         // load our POIFS document streams.
         DocumentEntry headerProps =
             (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
@@ -91,11 +103,7 @@
             }
         }
 
-        // Set POI values to null
-        headerProps = null;
-        header = null;
-        din = null;
-        doc = null;
-        fsys = null;
+        xhtml.endElement("p");
+        xhtml.endDocument();
     }
 }

svn commit: r628475 - in /incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft: ExcelParser.java OfficeParser.java PowerPointExtractor.java PowerPointParser.java WordParser.java

Reply via email to