Author: jukka
Date: Wed Mar 26 12:51:15 2008
New Revision: 641521
URL: http://svn.apache.org/viewvc?rev=641521&view=rev
Log:
TIKA-132: Refactor Excel extractor to parse per sheet and add hyperlink support
- Improved formatting of internalProcessRecord
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=641521&r1=641520&r2=641521&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Mar 26 12:51:15 2008
@@ -40,6 +40,7 @@
import org.apache.poi.hssf.record.ExtendedFormatRecord;
import org.apache.poi.hssf.record.FormatRecord;
import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.UnicodeString;
//import org.apache.poi.hssf.record.HyperlinkRecord; // FIXME - requires POI release
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
@@ -213,89 +214,69 @@
private void internalProcessRecord(Record record) throws SAXException {
switch (record.getSid()) {
+ case BOFRecord.sid: // start of workbook, worksheet etc. records
+ BOFRecord bof = (BOFRecord) record;
+ if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
+ currentSheetIndex = -1;
+ } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
+ currentSheetIndex++;
+ currentSheet.clear();
+ insideWorksheet = true;
+ }
+ break;
- /* BOFRecord: indicates start of workbook, worksheet etc. records */
- case BOFRecord.sid:
- switch (((BOFRecord) record).getType()) {
- case BOFRecord.TYPE_WORKBOOK:
- currentSheetIndex = -1;
- break;
- case BOFRecord.TYPE_WORKSHEET:
- currentSheetIndex++;
- currentSheet.clear();
- insideWorksheet = true;
- break;
- }
- break;
-
- /* EOFRecord: indicates end of workbook, worksheet etc. records */
- case EOFRecord.sid:
- // ignore empty sheets
- if (insideWorksheet && !currentSheet.isEmpty()) {
- processSheet();
- }
- insideWorksheet = false;
- break;
+ case EOFRecord.sid: // end of workbook, worksheet etc. records
+ if (insideWorksheet && !currentSheet.isEmpty()) {
+ processSheet();
+ }
+ insideWorksheet = false;
+ break;
- /* SSTRecord: holds all the strings for LabelSSTRecords */
- case SSTRecord.sid:
- sstRecord = (SSTRecord)record;
- break;
-
- /* BoundSheetRecord: Worksheet index record */
- case BoundSheetRecord.sid:
- BoundSheetRecord boundSheetRecord = (BoundSheetRecord)record;
- String sheetName = boundSheetRecord.getSheetname();
- sheetNames.add(sheetName);
- break;
-
- // FIXME - requires POI release
- ///* HyperlinkRecord: holds a URL associated with a cell */
- //case HyperlinkRecord.sid:
- // HyperlinkRecord hyperlinkRecord = (HyperlinkRecord)record;
- // if (insideWorksheet) {
- // int row = hyperlinkRecord.getFirstRow();
- // short column = hyperlinkRecord.getFirstColumn();
- // Point point = new Point(column, row);
- // Cell cell = currentSheet.get(point);
- // if (cell != null) {
- // cell = new LinkedCell(cell, hyperlinkRecord.getAddress());
- // currentSheet.put(point, cell);
- // }
- // }
- // break;
-
- /* FormulaRecord: Cell value from a formula */
- case FormulaRecord.sid:
- FormulaRecord formula = (FormulaRecord) record;
- addCell(record, new NumberCell(formula.getValue()));
- break;
-
- /* LabelRecord: strings stored directly in the cell */
- case LabelRecord.sid:
- LabelRecord label = (LabelRecord) record;
- addCell(record, getTextCell(label.getValue()));
- break;
-
- /* LabelSSTRecord: Ref. a string in the shared string table */
- case LabelSSTRecord.sid:
- LabelSSTRecord labelSSTRecord = (LabelSSTRecord) record;
- int sstIndex = labelSSTRecord.getSSTIndex();
- String sstLabel = sstRecord.getString(sstIndex).getString();
- addCell(record, getTextCell(sstLabel));
- break;
-
- /* NumberRecord: Contains a numeric cell value */
- case NumberRecord.sid:
- NumberRecord number = (NumberRecord) record;
- addCell(record, new NumberCell(number.getValue()));
- break;
-
- /* RKRecord: Excel internal number record */
- case RKRecord.sid:
- RKRecord rk = (RKRecord) record;
- addCell(record, new NumberCell(rk.getRKNumber()));
- break;
+ case BoundSheetRecord.sid: // Worksheet index record
+ BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
+ sheetNames.add(boundSheetRecord.getSheetname());
+ break;
+
+ case SSTRecord.sid: // holds all the strings for LabelSSTRecords
+ sstRecord = (SSTRecord) record;
+ break;
+
+ case FormulaRecord.sid: // Cell value from a formula
+ FormulaRecord formula = (FormulaRecord) record;
+ addCell(record, new NumberCell(formula.getValue()));
+ break;
+
+ case LabelRecord.sid: // strings stored directly in the cell
+ LabelRecord label = (LabelRecord) record;
+ addTextCell(record, label.getValue());
+ break;
+
+ case LabelSSTRecord.sid: // Ref. a string in the shared string table
+ LabelSSTRecord sst = (LabelSSTRecord) record;
+ UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
+ addTextCell(record, unicode.getString());
+ break;
+
+ case NumberRecord.sid: // Contains a numeric cell value
+ NumberRecord number = (NumberRecord) record;
+ addCell(record, new NumberCell(number.getValue()));
+ break;
+
+ case RKRecord.sid: // Excel internal number record
+ RKRecord rk = (RKRecord) record;
+ addCell(record, new NumberCell(rk.getRKNumber()));
+ break;
+
+ // FIXME - requires POI release
+ // case HyperlinkRecord.sid: // holds a URL associated with a cell
+ // HyperlinkRecord link = (HyperlinkRecord) record;
+ // Point point =
+ // new Point(link.getFirstColumn(), link.getFirstRow());
+ // Cell cell = currentSheet.get(point);
+ // if (cell != null) {
+ // addCell(record, new LinkedCell(cell, link.getAddress()));
+ // }
+ // break;
}
}
@@ -320,20 +301,19 @@
}
/**
- * Returns a text cell with the given text comment. The given text
+ * Adds a text cell with the given text content. The given text
* is trimmed, and ignored if <code>null</code> or empty.
*
+ * @param record record that holds the text value
* @param text text content, may be <code>null</code>
- * @return text cell, or <code>null</code>
*/
- private Cell getTextCell(String text) {
+ private void addTextCell(Record record, String text) {
if (text != null) {
text = text.trim();
if (text.length() > 0) {
- return new TextCell(text);
+ addCell(record, new TextCell(text));
}
}
- return null;
}
/**