Author: jukka
Date: Sun Feb 17 03:40:20 2008
New Revision: 628475
URL: http://svn.apache.org/viewvc?rev=628475&view=rev
Log:
TIKA-123: Structured MS Office parsing
- Changed OfficeParser to allow structured parsing in subclasses
- ExcelParser now outputs XHTML tables with nice tabs and line breaks
- Dropped unused formatting code from ExcelParser (TODO fix that)
- Streamlined PowerPointParser and started using Java 5 features
- No functional changes (yet) in PowerPointParser
- No functional changes (yet) in WordParser
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?rev=628475&r1=628474&r2=628475&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
Sun Feb 17 03:40:20 2008
@@ -19,10 +19,10 @@
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
@@ -41,11 +41,12 @@
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
-import org.apache.poi.hssf.record.UnicodeString;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Excel parser implementation which uses POI's Event API
@@ -115,16 +116,18 @@
* to the specified {@link Appendable}.
*
* @param filesystem POI file system
- * @param appendable Where to output the parsed contents
* @throws IOException if an error occurs processing the workbook
* or writing the extracted content
*/
- protected void extractText(final POIFSFileSystem filesystem,
- final Appendable appendable) throws IOException {
+ protected void parse(
+ POIFSFileSystem filesystem, ContentHandler handler, Metadata
metadata)
+ throws IOException, SAXException {
log.debug("Starting listenForAllRecords=" + listenForAllRecords);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
// Set up listener and register the records we want to process
- TikaHSSFListener listener = new TikaHSSFListener(appendable);
+ TikaHSSFListener listener = new TikaHSSFListener(xhtml);
HSSFRequest hssfRequest = new HSSFRequest();
if (listenForAllRecords) {
hssfRequest.addListenerForAllRecords(listener);
@@ -147,9 +150,11 @@
// Create event factory and process Workbook (fire events)
DocumentInputStream documentInputStream =
filesystem.createDocumentInputStream("Workbook");
HSSFEventFactory eventFactory = new HSSFEventFactory();
- eventFactory.processEvents(hssfRequest, documentInputStream);
- log.debug("Processed " + listener.getRecordCount() + " records");
+ xhtml.startDocument();
+ eventFactory.processEvents(hssfRequest, documentInputStream);
+ listener.throwStoredException();
+ xhtml.endDocument();
}
// ======================================================================
@@ -162,38 +167,29 @@
/** Logging instance */
private static Log log = LogFactory.getLog(ExcelParser.class);
- private final Appendable appendable;
- private int recordCount;
+ private final XHTMLContentHandler handler;
+
+ private SAXException exception;
+
private SSTRecord sstRecord;
- private Map<Short, String> formats = new HashMap<Short,
String>();
- private Map<Short, Short> extendedFormats = new HashMap<Short,
Short>();
private List<String> sheetNames = new ArrayList<String>();
- private short bofRecordType;
- private short defualtCountry;
- private short currentCountry;
- private short currentXFormatIdx;
private short currentSheetIndex;
- private String currentSheetName;
- private boolean firstElement = true;
- private boolean use1904windowing = false;
- /**
- * Contstruct a new listener instance outputting parsed data to
- * the specified Appendable.
- *
- * @param appendable Destination to write the parsed output to
- */
- private TikaHSSFListener(final Appendable appendable) {
- this.appendable = appendable;
- }
+ private boolean insideWorksheet = false;
+
+ private int currentRow;
+
+ private short currentColumn;
/**
- * Return a count of the number of records processed.
+ * Contstruct a new listener instance outputting parsed data to
+ * the specified XHTML content handler.
*
- * @return The number of records processed by this listener
+ * @param handler Destination to write the parsed output to
*/
- private int getRecordCount() {
- return recordCount;
+ private TikaHSSFListener(XHTMLContentHandler handler) {
+ this.handler = handler;
+ this.exception = null;
}
/**
@@ -201,64 +197,70 @@
*
* @param record HSSF Record
*/
- public void processRecord(final Record record) {
- recordCount++;
- final short sid = record.getSid();
- switch (sid) {
+ public void processRecord(Record record) {
+ try {
+ if (log.isDebugEnabled()) {
+ log.debug(record.toString());
+ }
+ internalProcessRecord(record);
+ } catch (SAXException e) {
+ if (exception == null) {
+ exception = e;
+ }
+ }
+ }
+
+ public void throwStoredException() throws SAXException {
+ if (exception != null) {
+ throw exception;
+ }
+ }
+
+ private void internalProcessRecord(Record record) throws SAXException {
+ switch (record.getSid()) {
/* BOFRecord: indicates start of workbook, worksheet etc.
records */
case BOFRecord.sid:
- BOFRecord bofRecord = (BOFRecord)record;
- bofRecordType = bofRecord.getType();
- switch (bofRecordType) {
+ switch (((BOFRecord) record).getType()) {
case BOFRecord.TYPE_WORKBOOK:
currentSheetIndex = -1;
- debug(record, ".Workbook");
break;
case BOFRecord.TYPE_WORKSHEET:
currentSheetIndex++;
- currentSheetName = null;
+ String currentSheetName = "";
if (currentSheetIndex < sheetNames.size()) {
currentSheetName =
sheetNames.get(currentSheetIndex);
}
- debug(record,
- ".Worksheet[" + currentSheetIndex
- + "], Name=[" + currentSheetName + "]");
- addText(currentSheetName);
- break;
- default:
- debug(record, "[" + bofRecordType + "]");
+ handler.startElement("div", "class", "page");
+ handler.element("h1", currentSheetName);
+ handler.characters("\n");
+ handler.startElement("table");
+ handler.startElement("tbody");
+ handler.startElement("tr");
+ handler.startElement("td");
+ insideWorksheet = true;
+ currentRow = 0;
+ currentColumn = 0;
break;
}
break;
- /* BOFRecord: indicates end of workbook, worksheet etc.
records */
+ /* EOFRecord: indicates end of workbook, worksheet etc.
records */
case EOFRecord.sid:
- debug(record, "");
- bofRecordType = 0;
- break;
-
- /* Indicates whether to use 1904 Date Windowing or not */
- case DateWindow1904Record.sid:
- DateWindow1904Record dw1904Rec =
(DateWindow1904Record)record;
- use1904windowing = (dw1904Rec.getWindowing() == 1);
- debug(record, "[" + use1904windowing + "]");
- break;
-
- /* CountryRecord: holds all the strings for LabelSSTRecords */
- case CountryRecord.sid:
- CountryRecord countryRecord = (CountryRecord)record;
- defualtCountry = countryRecord.getDefaultCountry();
- currentCountry = countryRecord.getCurrentCountry();
- debug(record,
- " default=[" + defualtCountry
- + "], current=[" + currentCountry + "]");
+ if (insideWorksheet) {
+ handler.endElement("td");
+ handler.endElement("tr");
+ handler.endElement("tbody");
+ handler.endElement("table");
+ handler.endElement("div");
+ handler.characters("\n");
+ insideWorksheet = false;
+ }
break;
/* SSTRecord: holds all the strings for LabelSSTRecords */
case SSTRecord.sid:
sstRecord = (SSTRecord)record;
- debug(record, "");
break;
/* BoundSheetRecord: Worksheet index record */
@@ -266,41 +268,14 @@
BoundSheetRecord boundSheetRecord =
(BoundSheetRecord)record;
String sheetName = boundSheetRecord.getSheetname();
sheetNames.add(sheetName);
- debug(record,
- "[" + sheetNames.size()
- + "], Name=[" + sheetName + "]");
- break;
-
- /* FormatRecord */
- case FormatRecord.sid:
- FormatRecord formatRecord = (FormatRecord)record;
- String dataFormat = formatRecord.getFormatString();
- short formatIdx = formatRecord.getIndexCode();
- formats.put(formatIdx, dataFormat);
- debug(record, "[" + formatIdx + "]=[" + dataFormat + "]");
- break;
-
- /* ExtendedFormatRecord */
- case ExtendedFormatRecord.sid:
- ExtendedFormatRecord xFormatRecord =
(ExtendedFormatRecord)record;
- if (xFormatRecord.getXFType() ==
ExtendedFormatRecord.XF_CELL) {
- short dataFormatIdx = xFormatRecord.getFormatIndex();
- if (dataFormatIdx > 0) {
- extendedFormats.put(currentXFormatIdx,
dataFormatIdx);
- debug(record,
- "[" + currentXFormatIdx
- + "]=FormatRecord[" + dataFormatIdx + "]");
- }
- }
- currentXFormatIdx++;
break;
default:
- if (bofRecordType == BOFRecord.TYPE_WORKSHEET
+ if (insideWorksheet
&& record instanceof CellValueRecordInterface) {
- processCellValue(sid,
(CellValueRecordInterface)record);
- } else {
- debug(record, "");
+ processCellValue(
+ record.getSid(),
+ (CellValueRecordInterface)record);
}
break;
}
@@ -312,102 +287,57 @@
* @param sid record type identifier
* @param record The cell value record
*/
- private void processCellValue(final short sid,
- final CellValueRecordInterface record) {
+ private void processCellValue(
+ short sid, CellValueRecordInterface record)
+ throws SAXException {
+ while (currentRow < record.getRow()) {
+ handler.endElement("td");
+ handler.endElement("tr");
+ handler.characters("\n");
+ handler.startElement("tr");
+ handler.startElement("td");
+ currentRow++;
+ currentColumn = 0;
+ }
+ while (currentColumn < record.getColumn()) {
+ handler.endElement("td");
+ handler.characters("\t");
+ handler.startElement("td");
+ currentColumn++;
+ }
- short xfIdx = record.getXFIndex();
- Short dfIdx = extendedFormats.get(xfIdx);
- String dataFormat = dfIdx != null ? formats.get(dfIdx) : null;
- String str = null;
switch (sid) {
-
/* FormulaRecord: Cell value from a formula */
case FormulaRecord.sid:
FormulaRecord formulaRecord = (FormulaRecord)record;
double fmlValue = formulaRecord.getValue();
- str = toString(fmlValue, dfIdx, dataFormat);
- str = addText(str);
+ addText(Double.toString(fmlValue));
break;
/* LabelRecord: strings stored directly in the cell */
case LabelRecord.sid:
- LabelRecord labelRecord = (LabelRecord)record;
- str = addText(labelRecord.getValue());
+ addText(((LabelRecord) record).getValue());
break;
/* LabelSSTRecord: Ref. a string in the shared string table */
case LabelSSTRecord.sid:
- LabelSSTRecord labelSSTRecord = (LabelSSTRecord)record;
+ LabelSSTRecord labelSSTRecord = (LabelSSTRecord) record;
int sstIndex = labelSSTRecord.getSSTIndex();
- UnicodeString unicodeStr = sstRecord.getString(sstIndex);
- str = addText(unicodeStr.getString());
+ addText(sstRecord.getString(sstIndex).getString());
break;
/* NumberRecord: Contains a numeric cell value */
case NumberRecord.sid:
double numValue = ((NumberRecord)record).getValue();
- if (!Double.isNaN(numValue)) {
- str = Double.toString(numValue);
- }
- str = toString(numValue, dfIdx, dataFormat);
- str = addText(str);
+ addText(Double.toString(numValue));
break;
/* RKRecord: Excel internal number record */
case RKRecord.sid:
double rkValue = ((RKRecord)record).getRKNumber();
- str = toString(rkValue, dfIdx, dataFormat);
- str = addText(str);
+ addText(Double.toString(rkValue));
break;
}
-
- // =========== Debug Mess: START ===========
- if (log.isDebugEnabled()) {
- StringBuilder builder = new StringBuilder();
- builder.append('[');
- //
builder.append(ExcelUtils.columnIndexToLabel(record.getColumn()));
- builder.append(record.getColumn());
- builder.append(":");
- builder.append((record.getRow() + 1));
- builder.append(']');
- if (dfIdx != null) {
- builder.append(" xfIdx[");
- builder.append(xfIdx).append(']');
- builder.append("=dfIdx[");
- builder.append(dfIdx);
- builder.append(']');
- if (dataFormat != null) {
- builder.append("=[");
- builder.append(dataFormat);
- builder.append(']');
- }
- }
- builder.append(", value=[");
- if (str != null && str.length() > 0) {
- builder.append(str);
- }
- builder.append(']');
- debug((Record)record, builder.toString());
- }
- // =========== Debug Mess: END =============
- }
-
- /**
- * Converts a numeric excel cell value to a String.
- *
- * @param value The cell value
- * @param dfIdx The data format index
- * @param dataFormat The data format
- * @return Formatted string value
- */
- private String toString(double value, Short dfIdx, String dataFormat) {
- if (Double.isNaN(value)) {
- return null;
- }
-
- // **** TODO: Data Format parsing ****
- // return ExcelUtils.format(value, dfIdx, dataFormat,
use1904windowing);
- return Double.toString(value);
}
/**
@@ -416,43 +346,15 @@
* Null and zero length values are ignored.
*
* @param text The text value
- * @return the added text
*/
- private String addText(String text) {
+ private void addText(String text) throws SAXException {
if (text != null) {
text = text.trim();
if (text.length() > 0) {
- try {
- if (!firstElement) {
- appendable.append(" ");
- }
- appendable.append(text);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- firstElement = false;
+ handler.characters(text);
}
}
- return text;
}
- /**
- * Record debugging.
- *
- * @param record The Record
- * @param msg Debug Message
- */
- private void debug(Record record, String msg) {
- if (log.isDebugEnabled()) {
- String className = record.getClass().getSimpleName();
- String text = (msg == null ? className : className + msg);
- if (record.getSid() == BOFRecord.sid ||
- record.getSid() == EOFRecord.sid) {
- log.debug(text);
- } else {
- log.debug(" " + text);
- }
- }
- }
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=628475&r1=628474&r2=628475&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Sun Feb 17 03:40:20 2008
@@ -29,8 +29,6 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.AppendableAdaptor;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -54,13 +52,7 @@
filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
metadata);
- XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.startElement("p");
- extractText(filesystem, new AppendableAdaptor(xhtml));
- xhtml.endElement("p");
- xhtml.endDocument();
+ parse(filesystem, handler, metadata);
}
/**
@@ -73,8 +65,9 @@
/**
* Extracts the text content from a Microsoft document input stream.
*/
- protected abstract void extractText(POIFSFileSystem filesystem, Appendable
appendable)
- throws IOException, TikaException;
+ protected abstract void parse(
+ POIFSFileSystem filesystem, ContentHandler handler, Metadata
metadata)
+ throws IOException, SAXException, TikaException;
private void getMetadata(
POIFSFileSystem filesystem, String name, Metadata metadata) {
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=628475&r1=628474&r2=628475&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
Sun Feb 17 03:40:20 2008
@@ -18,8 +18,20 @@
import java.io.IOException;
import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.log4j.Logger;
+import org.apache.poi.hdf.extractor.Utils;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Power point parser
@@ -29,20 +41,407 @@
/**
* Name of a PowerPoint document within a POIFS file system
*/
- private static final String POWERPOINT = "PowerPoint Document";
+ private static final String POWERPOINT = "PowerPoint Document";
+
+ /**
+ * Logger instance.
+ */
+ private static final Logger LOG = Logger.getLogger(PowerPointParser.class);
protected String getContentType() {
return "application/vnd.ms-powerpoint";
}
- protected void extractText(POIFSFileSystem filesystem, Appendable builder)
throws IOException {
-
- InputStream stream = filesystem.createDocumentInputStream(POWERPOINT);
+ protected void parse(
+ POIFSFileSystem poifs, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ InputStream stream = poifs.createDocumentInputStream(POWERPOINT);
try {
- new PowerPointExtractor(builder).extract(stream);
+ xhtml.startDocument();
+ xhtml.startElement("p");
+ parse(stream, xhtml);
+ xhtml.startElement("e");
+ xhtml.endDocument();
} finally {
stream.close();
}
+ }
+
+ /**
+ * Reads the internal PowerPoint document stream.
+ */
+ private void parse(InputStream dis, XHTMLContentHandler xhtml) {
+ try {
+ final byte pptdata[] = new byte[dis.available()];
+ dis.read(pptdata, 0, dis.available());
+ int offset = 0;
+ long offsetPD = 0;
+
+ /*
+ * Traverse Bytearray to get CurrentUserEditAtom Call to extract
the
+ * Text in all PlaceHolders to hold PPTClientTextBox objects for
+ * mapping into Slide Objects
+ */
+ Map<Long, TextBox> containerTextBox = new HashMap<Long, TextBox>();
+ // Traverse ByteArray to identiy edit paths of ClientTextBoxes
+ long n = pptdata.length - 20;
+ for (long i = 0; i < n; i++) {
+
+ final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
+ // final long size = LittleEndian.getUInt(pptdata, (int) i +
4);
+
+ if (PPTConstants.PPT_ATOM_USEREDIT == type) {
+ /*
+ * Checking the Record Header (UserEditAtom)
+ */
+ // final long lastSlideID = LittleEndian.getInt(pptdata,
+ // (int) i + 8);
+ // final long version = LittleEndian.getUInt(pptdata, (int)
+ // i + 12);
+ offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
+ offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);
+
+ /*
+ * Call to extract ClientTextBox text in each UserEditAtom
+ */
+ extractTextBoxes(containerTextBox, offset, pptdata,
offsetPD);
+ } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
+ // if (LOG.isTraceEnabled()) {
+ // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
+ // }
+ } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
+ // if (LOG.isTraceEnabled()) {
+ // LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
+ // }
+ } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
+ // if (LOG.isTraceEnabled()) {
+ // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
+ // }
+ } else {
+ // no action
+ // if (LOG.isTraceEnabled()) {
+ // LOG.trace("type not handled: " + type);
+ // }
+ }
+ }
+
+ List<Slide> slides = extractSlides(offset, pptdata, offsetPD);
+
+ if (slides.size() == 0) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("No slides extracted!");
+ }
+
+ } else {
+ Slide slide = (Slide) slides.get(slides.size() - 1);
+
+ for (TextBox textBox : containerTextBox.values()) {
+ slide.addContent(textBox.getContent());
+ }
+
+ /*
+ * Merging TextBox data with Slide Data Printing the text from
+ * Slides vector object.
+ */
+ for (Slide s : slides) {
+ List scontent = s.getContent();
+ for (int j = 0; j < scontent.size(); j++) {
+ String contentText = scontent.get(j).toString();
+ xhtml.characters(contentText);
+
+ // to avoid concatinated words we add a blank
additional
+ if (contentText.length() > 0
+ && !(contentText.endsWith("\r") || contentText
+ .endsWith("\n"))) {
+ xhtml.characters(" ");
+ }
+ }
+ }
+ }
+ } catch (Throwable ex) {
+ // because of not killing complete crawling all Throwables are
+ // catched.
+
+ LOG.error("processPOIFSReaderEvent", ex);
+ }
+ }
+
+ /**
+ * Extracts the client text boxes of a slide.
+ *
+ * @param containerTextBox
+ * @param offset
+ * @param pptdata
+ * @param offsetPD
+ * @see TextBox
+ */
+ private void extractTextBoxes(
+ Map<Long, TextBox> containerTextBox,
+ int offset, byte[] pptdata, long offsetPD) {
+
+ // To hold temporary data
+ FilteredStringWriter outStream = new FilteredStringWriter();
+
+ TextBox textBox;
+
+ // Traversing the bytearray up to Presist directory position
+ for (int i = offset; i < offsetPD - 20; i++) {
+ try {
+ // Record info
+ // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
+ // Record Type
+ final long recordType = LittleEndian.getUShort(pptdata, i + 2);
+ // Record Size
+ final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
+
+ if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+ /*
+ * Record type is of Drawing Group
+ */
+
+ // Total number of objects
+ // final long objectCount = LittleEndian.getUInt(pptdata,
+ // (int) i +
+ // 8);
+ // currentID = Group ID+number of objects
+ long currentID = LittleEndian.getInt(pptdata, i + 12);
+ currentID = ((int) (currentID / 1024)) * 1024;
+
+ if (currentID == PPTConstants.PPT_MASTERSLIDE) {
+ // Ignore Master Slide objects
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Ignore master slide.");
+ }
+ i++;
+ continue;
+ }
+
+ // Check for the ClientTextBox GroupID existence
+ if (containerTextBox.containsKey(new Long(currentID))) {
+ // If exists get Client Textbox Group
+ textBox = (TextBox) containerTextBox.get(new Long(
+ currentID));
+ textBox.setContent("");
+
+ } else {
+ textBox = new TextBox(currentID);
+ containerTextBox.put(new Long(currentID), textBox);
+ }
+
+ /*
+ * Iterating the bytearray for TextCharAtoms and
+ * TextBytesAtom
+ */
+ if ((offsetPD - 20) != recordSize) {
+ // TODO something wrong? Probably an OLE-Object, which
+ // we ignore.
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("offsetPD - 20=" + (offsetPD - 20)
+ + " recordsize=" + recordSize);
+ }
+ } else {
+ for (int startPos = i + 8; startPos < offsetPD - 20
+ && startPos < recordSize; startPos++) { // &&
+ // startPos
+ // <
+ // recordSize??
+ try {
+
+ // Record info
+ // final long nrinfo =
+ // LittleEndian.getUShort(pptdata, (int) j);
+
+ // Record Type
+ final long ntype = LittleEndian.getUShort(
+ pptdata, startPos + 2);
+
+ // Record size
+ // Note that the size doesn't include the 8
byte
+ // atom header
+ final long nsize = LittleEndian.getUInt(
+ pptdata, startPos + 4);
+
+ if (ntype ==
PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+ /*
+ * Break the loop if next GroupID found
+ */
+ i = startPos - 1;
+ break;
+ } else if (ntype ==
PPTConstants.PPT_ATOM_TEXTBYTE) {
+ // TextByteAtom record
+ outStream = new FilteredStringWriter();
+ long ii = 0;
+ for (ii = startPos + 6; ii <= startPos + 6
+ + nsize; ii++) {
+ // For loop to changed to a function
+ // if ((ii + 2) >= pptdata.length)
+ // break; // FIXME
+ outStream
+ .write((char) (pptdata[(int)
ii + 2]));
+ }
+
+ // Setting the identified text for Current
+ // groupID
+ textBox.setContent(textBox.getContent()
+ + outStream.toString());
+
+ } else if (ntype ==
PPTConstants.PPT_ATOM_TEXTCHAR) {
+ // TextCharAtom record
+
+ final String strTempContent = new String(
+ pptdata, startPos + 6,
+ (int) (nsize) + 2);
+ final byte bytes[] = strTempContent
+ .getBytes();
+ if (true) {
+ outStream = new FilteredStringWriter();
+ for (int ii = 0; ii < bytes.length -
1; ii += 2) {
+ // For loop to changed to a
function
+ outStream
+ .write((char) (pptdata[ii
+ 2]));
+ }
+ textBox.setContent(textBox.getContent()
+ + outStream.toString());
+ } else {
+ // this version is used within POI
+ String text = StringUtil
+ .getFromCompressedUnicode(
+ bytes, 0,
bytes.length);
+ textBox.setContent(textBox.getContent()
+ + text);
+ }
+
+ } else {
+ // ignored
+ // if (LOG.isTraceEnabled()) {
+ // LOG.trace("Ignored atom type: " + type);
+ // }
+ }
+ } catch (Throwable e) {
+
+ LOG.error("extractTextBoxes", e);
+
+ break;
+ }
+ }
+ }
+ } else {
+ // Record type is ignored
+ // if (LOG.isTraceEnabled()) {
+ // LOG.trace("Ignored record type: " + type);
+ // }
+ }
+ } catch (Throwable ee) {
+ LOG.error("extractClientTextBoxes", ee);
+ }
+ }
+ }
+
+ /**
+ * Returns the Powerpoint <code>Slide</code> s of document as vector.
+ *
+ * @param offset
+ * @param pptdata
+ * @param offsetPD
+ * @return Vector of the powerpoint slides. Contains
+ * <code>{@link Slide Slide}</code>
+ * @see Slide
+ */
+ private List<Slide> extractSlides(
+ long offset, byte[] pptdata, long offsetPD) {
+ int sNum = 0;
+
+ // List of all slides found
+ List<Slide> slides = new ArrayList<Slide>();
+
+ // current slide data
+ Slide currentSlide = null;
+
+ // To store data found in TextCharAtoms and TextBytesAtoms
+ FilteredStringWriter outStream;
+
+ for (long i = offset; i < pptdata.length - 20; i++) {
+ final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
+ final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
+
+ if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
+ /*
+ * TextByteAtom record
+ */
+ outStream = new FilteredStringWriter();
+
+ for (long ii = i + 6; (ii <= i + 6 + atomSize)
+ && (ii + 2 < pptdata.length); ii++) {
+ try {
+ // if(ii+2 >= pptdata.length) break; //FIXME
+ byte value = pptdata[(int) ii + 2];
+ outStream.write(value);
+ } catch (ArrayIndexOutOfBoundsException ex) {
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("size=" + pptdata.length);
+ }
+
+ LOG.error("extractSlides", ex);
+
+ }
+ }
+
+ // Setting the identified text for Current Slide
+ if (currentSlide != null) {
+ currentSlide.addContent(outStream.toString());
+ }
+
+ } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
+ /*
+ * TextCharAtom record
+ */
+ outStream = new FilteredStringWriter();
+ final String strTempContent = new String(pptdata, (int) i + 6,
+ (int) (atomSize) + 2);
+ final byte bytes[] = strTempContent.getBytes();
+
+ for (int ii = 0; ii < bytes.length - 1; ii += 2) {
+ outStream.write(Utils.getUnicodeCharacter(bytes, ii));
+ }
+
+ // Setting the identified text for Current Slide
+ if (currentSlide != null) {
+ currentSlide.addContent(outStream.toString());
+ }
+
+ } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
+ /*
+ * SlidePresistAtom Record
+ */
+ if (sNum != 0) {
+ outStream = new FilteredStringWriter();
+
+ final long slideID = LittleEndian.getUInt(pptdata,
+ (int) i + 20);
+
+ currentSlide = new Slide(slideID);
+ // currentSlide.addContent(outStream.toString());
+ slides.add(currentSlide);
+ }
+ sNum++;
+ } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
+ /*
+ * Diagram records are ignored
+ */
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Drawing Groups are ignored.");
+ }
+ break;
+ } else {
+ // ignored
+ // if (LOG.isTraceEnabled()) {
+ // LOG.trace("Unhandled atomType: " + atomType);
+ // }
+ }
+ }
+
+ return slides;
}
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=628475&r1=628474&r2=628475&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Sun Feb 17 03:40:20 2008
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft;
+import org.apache.log4j.xml.SAXErrorHandler;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Range;
@@ -24,6 +25,11 @@
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AppendableAdaptor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import java.io.IOException;
@@ -42,8 +48,14 @@
* @param fsys the <code>POIFSFileSystem</code> to read the word document
from.
* @param appendable the <code>Appendable</code> to add the text content
to.
*/
- public void extractText(POIFSFileSystem fsys, Appendable appendable)
- throws IOException, TikaException {
+ public void parse(
+ POIFSFileSystem fsys, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.startElement("p");
+ Appendable appendable = new AppendableAdaptor(xhtml);
+
// load our POIFS document streams.
DocumentEntry headerProps =
(DocumentEntry) fsys.getRoot().getEntry("WordDocument");
@@ -91,11 +103,7 @@
}
}
- // Set POI values to null
- headerProps = null;
- header = null;
- din = null;
- doc = null;
- fsys = null;
+ xhtml.endElement("p");
+ xhtml.endDocument();
}
}