Author: jukka
Date: Mon Feb 18 15:08:30 2008
New Revision: 628913

URL: http://svn.apache.org/viewvc?rev=628913&view=rev
Log:
TIKA-123: Structured MS Office parsing
    - Replaced custom PowerPoint parser with PowerPointExtractor from POI HSLF 

Removed:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Slide.java
Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=628913&r1=628912&r2=628913&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java Mon Feb 18 15:08:30 2008
@@ -17,17 +17,9 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
 
-import org.apache.log4j.Logger;
-import org.apache.poi.hdf.extractor.Utils;
+import org.apache.poi.hslf.extractor.PowerPointExtractor;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.LittleEndian;
-import org.apache.poi.util.StringUtil;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
@@ -38,16 +30,6 @@
  */
 public class PowerPointParser extends OfficeParser {
 
-    /**
-     *  Name of a PowerPoint document within a POIFS file system
-     */
-    private static final String POWERPOINT = "PowerPoint Document";
-
-    /**
-     * Logger instance.
-     */
-    private static final Logger LOG = Logger.getLogger(PowerPointParser.class);
-
     protected String getContentType() {
         return "application/vnd.ms-powerpoint";
     }
@@ -56,392 +38,11 @@
             POIFSFileSystem poifs, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        InputStream stream = poifs.createDocumentInputStream(POWERPOINT);
-        try {
-            xhtml.startDocument();
-            xhtml.startElement("p");
-            parse(stream, xhtml);
-            xhtml.endElement("p");
-            xhtml.endDocument();
-        } finally {
-            stream.close();
-        }
-    }
-
-    /**
-     * Reads the internal PowerPoint document stream.
-     */
-    private void parse(InputStream dis, XHTMLContentHandler xhtml) {
-        try {
-            final byte pptdata[] = new byte[dis.available()];
-            dis.read(pptdata, 0, dis.available());
-            int offset = 0;
-            long offsetPD = 0;
-
-            /*
-             * Traverse Bytearray to get CurrentUserEditAtom Call to extract the
-             * Text in all PlaceHolders to hold PPTClientTextBox objects for
-             * mapping into Slide Objects
-             */
-            Map<Long, TextBox> containerTextBox = new HashMap<Long, TextBox>();
-            // Traverse ByteArray to identiy edit paths of ClientTextBoxes
-            long n = pptdata.length - 20;
-            for (long i = 0; i < n; i++) {
-
-                final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
-                // final long size = LittleEndian.getUInt(pptdata, (int) i + 4);
-
-                if (PPTConstants.PPT_ATOM_USEREDIT == type) {
-                    /*
-                     * Checking the Record Header (UserEditAtom)
-                     */
-                    // final long lastSlideID = LittleEndian.getInt(pptdata,
-                    // (int) i + 8);
-                    // final long version = LittleEndian.getUInt(pptdata, (int)
-                    // i + 12);
-                    offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
-                    offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);
-
-                    /*
-                     * Call to extract ClientTextBox text in each UserEditAtom
-                     */
-                    extractTextBoxes(containerTextBox, offset, pptdata, offsetPD);
-                } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
-                    // if (LOG.isTraceEnabled()) {
-                    // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
-                    // }
-                } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
-                    // if (LOG.isTraceEnabled()) {
-                    // LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
-                    // }
-                } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
-                    // if (LOG.isTraceEnabled()) {
-                    // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
-                    // }
-                } else {
-                    // no action
-                    // if (LOG.isTraceEnabled()) {
-                    // LOG.trace("type not handled: " + type);
-                    // }
-                }
-            }
-
-            List<Slide> slides = extractSlides(offset, pptdata, offsetPD);
-
-            if (slides.size() == 0) {
-                if (LOG.isInfoEnabled()) {
-                    LOG.info("No slides extracted!");
-                }
-
-            } else {
-                Slide slide = (Slide) slides.get(slides.size() - 1);
-
-                for (TextBox textBox : containerTextBox.values()) {
-                    slide.addContent(textBox.getContent());
-                }
-
-                /*
-                 * Merging TextBox data with Slide Data Printing the text from
-                 * Slides vector object.
-                 */
-                for (Slide s : slides) {
-                    List scontent = s.getContent();
-                    for (int j = 0; j < scontent.size(); j++) {
-                        String contentText = scontent.get(j).toString();
-                        xhtml.characters(contentText);
-
-                        // to avoid concatinated words we add a blank additional
-                        if (contentText.length() > 0
-                                && !(contentText.endsWith("\r") || contentText
-                                        .endsWith("\n"))) {
-                            xhtml.characters(" ");
-                        }
-                    }
-                }
-            }
-        } catch (Throwable ex) {
-            // because of not killing complete crawling all Throwables are
-            // catched.
-
-            LOG.error("processPOIFSReaderEvent", ex);
-        }
-    }
-
-    /**
-     * Extracts the client text boxes of a slide.
-     * 
-     * @param containerTextBox
-     * @param offset
-     * @param pptdata
-     * @param offsetPD
-     * @see TextBox
-     */
-    private void extractTextBoxes(
-            Map<Long, TextBox> containerTextBox,
-            int offset, byte[] pptdata, long offsetPD) {
-
-        // To hold temporary data
-        FilteredStringWriter outStream = new FilteredStringWriter();
-
-        TextBox textBox;
-
-        // Traversing the bytearray up to Presist directory position
-        for (int i = offset; i < offsetPD - 20; i++) {
-            try {
-                // Record info
-                // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
-                // Record Type
-                final long recordType = LittleEndian.getUShort(pptdata, i + 2);
-                // Record Size
-                final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
-
-                if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
-                    /*
-                     * Record type is of Drawing Group
-                     */
-
-                    // Total number of objects
-                    // final long objectCount = LittleEndian.getUInt(pptdata,
-                    // (int) i +
-                    // 8);
-                    // currentID = Group ID+number of objects
-                    long currentID = LittleEndian.getInt(pptdata, i + 12);
-                    currentID = ((int) (currentID / 1024)) * 1024;
-
-                    if (currentID == PPTConstants.PPT_MASTERSLIDE) {
-                        // Ignore Master Slide objects
-                        if (LOG.isTraceEnabled()) {
-                            LOG.trace("Ignore master slide.");
-                        }
-                        i++;
-                        continue;
-                    }
-
-                    // Check for the ClientTextBox GroupID existence
-                    if (containerTextBox.containsKey(new Long(currentID))) {
-                        // If exists get Client Textbox Group
-                        textBox = (TextBox) containerTextBox.get(new Long(
-                                currentID));
-                        textBox.setContent("");
-
-                    } else {
-                        textBox = new TextBox(currentID);
-                        containerTextBox.put(new Long(currentID), textBox);
-                    }
-
-                    /*
-                     * Iterating the bytearray for TextCharAtoms and
-                     * TextBytesAtom
-                     */
-                    if ((offsetPD - 20) != recordSize) {
-                        // TODO something wrong? Probably an OLE-Object, which
-                        // we ignore.
-                        if (LOG.isDebugEnabled()) {
-                            LOG.debug("offsetPD - 20=" + (offsetPD - 20)
-                                    + " recordsize=" + recordSize);
-                        }
-                    } else {
-                        for (int startPos = i + 8; startPos < offsetPD - 20
-                                && startPos < recordSize; startPos++) { // &&
-                            // startPos
-                            // <
-                            // recordSize??
-                            try {
-
-                                // Record info
-                                // final long nrinfo =
-                                // LittleEndian.getUShort(pptdata, (int) j);
-
-                                // Record Type
-                                final long ntype = LittleEndian.getUShort(
-                                        pptdata, startPos + 2);
-
-                                // Record size
-                                // Note that the size doesn't include the 8 byte
-                                // atom header
-                                final long nsize = LittleEndian.getUInt(
-                                        pptdata, startPos + 4);
-
-                                if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
-                                    /*
-                                     * Break the loop if next GroupID found
-                                     */
-                                    i = startPos - 1;
-                                    break;
-                                } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
-                                    // TextByteAtom record
-                                    outStream = new FilteredStringWriter();
-                                    long ii = 0;
-                                    for (ii = startPos + 6; ii <= startPos + 6
-                                            + nsize; ii++) {
-                                        // For loop to changed to a function
-                                        // if ((ii + 2) >= pptdata.length)
-                                        // break; // FIXME
-                                        outStream
-                                                .write((char) (pptdata[(int) ii + 2]));
-                                    }
-
-                                    // Setting the identified text for Current
-                                    // groupID
-                                    textBox.setContent(textBox.getContent()
-                                            + outStream.toString());
-
-                                } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
-                                    // TextCharAtom record
-
-                                    final String strTempContent = new String(
-                                            pptdata, startPos + 6,
-                                            (int) (nsize) + 2);
-                                    final byte bytes[] = strTempContent
-                                            .getBytes();
-                                    if (true) {
-                                        outStream = new FilteredStringWriter();
-                                        for (int ii = 0; ii < bytes.length - 1; ii += 2) {
-                                            // For loop to changed to a function
-                                            outStream
-                                                    .write((char) (pptdata[ii + 2]));
-                                        }
-                                        textBox.setContent(textBox.getContent()
-                                                + outStream.toString());
-                                    } else {
-                                        // this version is used within POI
-                                        String text = StringUtil
-                                                .getFromCompressedUnicode(
-                                                        bytes, 0, bytes.length);
-                                        textBox.setContent(textBox.getContent()
-                                                + text);
-                                    }
-
-                                } else {
-                                    // ignored
-                                    // if (LOG.isTraceEnabled()) {
-                                    // LOG.trace("Ignored atom type: " + type);
-                                    // }
-                                }
-                            } catch (Throwable e) {
-
-                                LOG.error("extractTextBoxes", e);
-
-                                break;
-                            }
-                        }
-                    }
-                } else {
-                    // Record type is ignored
-                    // if (LOG.isTraceEnabled()) {
-                    // LOG.trace("Ignored record type: " + type);
-                    // }
-                }
-            } catch (Throwable ee) {
-                LOG.error("extractClientTextBoxes", ee);
-            }
-        }
-    }
-
-    /**
-     * Returns the Powerpoint <code>Slide</code> s of document as vector.
-     * 
-     * @param offset
-     * @param pptdata
-     * @param offsetPD
-     * @return Vector of the powerpoint slides. Contains
-     *         <code>{@link Slide Slide}</code>
-     * @see Slide
-     */
-    private List<Slide> extractSlides(
-            long offset, byte[] pptdata, long offsetPD) {
-        int sNum = 0;
-
-        // List of all slides found
-        List<Slide> slides = new ArrayList<Slide>();
-
-        // current slide data
-        Slide currentSlide = null;
-
-        // To store data found in TextCharAtoms and TextBytesAtoms
-        FilteredStringWriter outStream;
-
-        for (long i = offset; i < pptdata.length - 20; i++) {
-            final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
-            final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
-
-            if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
-                /*
-                 * TextByteAtom record
-                 */
-                outStream = new FilteredStringWriter();
-
-                for (long ii = i + 6; (ii <= i + 6 + atomSize)
-                        && (ii + 2 < pptdata.length); ii++) {
-                    try {
-                        // if(ii+2 >= pptdata.length) break; //FIXME
-                        byte value = pptdata[(int) ii + 2];
-                        outStream.write(value);
-                    } catch (ArrayIndexOutOfBoundsException ex) {
-                        if (LOG.isTraceEnabled()) {
-                            LOG.trace("size=" + pptdata.length);
-                        }
-
-                        LOG.error("extractSlides", ex);
-
-                    }
-                }
-
-                // Setting the identified text for Current Slide
-                if (currentSlide != null) {
-                    currentSlide.addContent(outStream.toString());
-                }
-
-            } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
-                /*
-                 * TextCharAtom record
-                 */
-                outStream = new FilteredStringWriter();
-                final String strTempContent = new String(pptdata, (int) i + 6,
-                        (int) (atomSize) + 2);
-                final byte bytes[] = strTempContent.getBytes();
-
-                for (int ii = 0; ii < bytes.length - 1; ii += 2) {
-                    outStream.write(Utils.getUnicodeCharacter(bytes, ii));
-                }
-
-                // Setting the identified text for Current Slide
-                if (currentSlide != null) {
-                    currentSlide.addContent(outStream.toString());
-                }
-
-            } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
-                /*
-                 * SlidePresistAtom Record
-                 */
-                if (sNum != 0) {
-                    outStream = new FilteredStringWriter();
-
-                    final long slideID = LittleEndian.getUInt(pptdata,
-                            (int) i + 20);
-
-                    currentSlide = new Slide(slideID);
-                    // currentSlide.addContent(outStream.toString());
-                    slides.add(currentSlide);
-                }
-                sNum++;
-            } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
-                /*
-                 * Diagram records are ignored
-                 */
-                if (LOG.isTraceEnabled()) {
-                    LOG.trace("Drawing Groups are ignored.");
-                }
-                break;
-            } else {
-                // ignored
-                // if (LOG.isTraceEnabled()) {
-                // LOG.trace("Unhandled atomType: " + atomType);
-                // }
-            }
-        }
-
-        return slides;
+        xhtml.startDocument();
+        xhtml.startElement("p");
+        xhtml.characters(new PowerPointExtractor(poifs).getText(true, true));
+        xhtml.endElement("p");
+        xhtml.endDocument();
     }
 
 }


Reply via email to