Author: jukka
Date: Sun Jan 20 06:46:17 2008
New Revision: 613566
URL: http://svn.apache.org/viewvc?rev=613566&view=rev
Log:
TIKA-105: Excel parser implementation based on POI's Event API
- Replaced ExcelParser with ExcelEventParser
- Use a setter for listenForAllRecords
(JavaBean properties are more flexible
than constructor arguments)
- Use debug logging for all output
- Removed some of the explicit log.isDebugEnabled() checks
(simplicity over insignificant performance gains)
- Inlined the trivial debug(Record) method
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
- copied, changed from r613561,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
(from r613561,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java&r1=613561&r2=613566&rev=613566&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelEventParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
Sun Jan 20 06:46:17 2008
@@ -51,10 +51,6 @@
* Excel parser implementation which uses POI's Event API
* to handle the contents of a Workbook.
* <p>
- * This is an alternative implementation to Tika's
- * {@link ExcelParser} implementation which uses POI's
- * <code>HSSFWorkbook</code> to parse excel files.
- * <p>
* The Event API uses a much smaller memory footprint than
* <code>HSSFWorkbook</code> when processing excel files
* but at the cost of more complexity.
@@ -68,40 +64,40 @@
* @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
* POI Event API How To</a>
*/
-public class ExcelEventParser extends OfficeParser implements Serializable {
+public class ExcelParser extends OfficeParser implements Serializable {
/** Logging instance */
- private static Log log = LogFactory.getLog(ExcelEventParser.class);
+ private static Log log = LogFactory.getLog(ExcelParser.class);
/**
* <code>true</code> if the HSSFListener should be registered
- * to listen for all records or <code>false</code> if the listener
- * should be configured to only receive specified records.
+ * to listen for all records or <code>false</code> (the default)
+ * if the listener should be configured to only receive specified
+ * records.
*/
- private final boolean listenForAllRecords;
+ private boolean listenForAllRecords = false;
/**
- * Create an instance which only listens for the specified
- * records (i.e. <code>listenForAllRecords</code> is
- * <code>false</code>).
+ * Returns <code>true</code> if this parser is configured to listen
+ * for all records instead of just the specified few.
*/
- public ExcelEventParser() {
- this(false);
+ public boolean isListenForAllRecords() {
+ return listenForAllRecords;
}
/**
- * Create an instance specifying whether to listen for all
+ * Specifies whether this parser should listen for all
* records or just for the specified few.
* <p>
- * <strong>Note</strong> This constructor is intended primarily
- * for testing and debugging - under normal operation
- * <code>listenForAllRecords</code> should be <code>false</code>.
+ * <strong>Note:</strong> Under normal operation this setting should
+ * be <code>false</code> (the default), but you can experiment with
+ * this setting for testing and debugging purposes.
*
* @param listenForAllRecords <code>true</code> if the HSSFListener
* should be registered to listen for all records or <code>false</code>
* if the listener should be configured to only receive specified records.
*/
- public ExcelEventParser(boolean listenForAllRecords) {
+ public void setListenForAllRecords(boolean listenForAllRecords) {
this.listenForAllRecords = listenForAllRecords;
}
@@ -125,10 +121,7 @@
*/
protected void extractText(final POIFSFileSystem filesystem,
final Appendable appendable) throws IOException {
-
- if (log.isInfoEnabled()) {
- log.info("Starting listenForAllRecords=" + listenForAllRecords);
- }
+ log.debug("Starting listenForAllRecords=" + listenForAllRecords);
// Set up listener and register the records we want to process
TikaHSSFListener listener = new TikaHSSFListener(appendable);
@@ -156,9 +149,7 @@
HSSFEventFactory eventFactory = new HSSFEventFactory();
eventFactory.processEvents(hssfRequest, documentInputStream);
- if (log.isInfoEnabled()) {
- log.info("Processed " + listener.getRecordCount() + " records");
- }
+ log.debug("Processed " + listener.getRecordCount() + " records");
}
// ======================================================================
@@ -169,7 +160,7 @@
private static class TikaHSSFListener implements HSSFListener,
Serializable {
/** Logging instance */
- private static Log log = LogFactory.getLog(ExcelEventParser.class);
+ private static Log log = LogFactory.getLog(ExcelParser.class);
private final Appendable appendable;
private int recordCount;
@@ -230,23 +221,20 @@
if (currentSheetIndex < sheetNames.size()) {
currentSheetName =
sheetNames.get(currentSheetIndex);
}
- if (log.isDebugEnabled()) {
- debug(record, ".Worksheet[" + currentSheetIndex
- + "], Name=[" + currentSheetName +
"]");
- }
+ debug(record,
+ ".Worksheet[" + currentSheetIndex
+ + "], Name=[" + currentSheetName + "]");
addText(currentSheetName);
break;
default:
- if (log.isDebugEnabled()) {
- debug(record, "[" + bofRecordType + "]");
- }
+ debug(record, "[" + bofRecordType + "]");
break;
}
break;
/* EOFRecord: indicates end of workbook, worksheet etc.
records */
case EOFRecord.sid:
- debug(record);
+ debug(record, "");
bofRecordType = 0;
break;
@@ -254,9 +242,7 @@
case DateWindow1904Record.sid:
DateWindow1904Record dw1904Rec =
(DateWindow1904Record)record;
use1904windowing = (dw1904Rec.getWindowing() == 1);
- if (log.isDebugEnabled()) {
- debug(record, "[" + use1904windowing + "]");
- }
+ debug(record, "[" + use1904windowing + "]");
break;
/* CountryRecord: holds the default and current country */
@@ -264,16 +250,15 @@
CountryRecord countryRecord = (CountryRecord)record;
defualtCountry = countryRecord.getDefaultCountry();
currentCountry = countryRecord.getCurrentCountry();
- if (log.isDebugEnabled()) {
- debug(record, " default=[" + defualtCountry
- + "], current=[" + currentCountry + "]");
- }
+ debug(record,
+ " default=[" + defualtCountry
+ + "], current=[" + currentCountry + "]");
break;
/* SSTRecord: holds all the strings for LabelSSTRecords */
case SSTRecord.sid:
sstRecord = (SSTRecord)record;
- debug(record);
+ debug(record, "");
break;
/* BoundSheetRecord: Worksheet index record */
@@ -281,10 +266,9 @@
BoundSheetRecord boundSheetRecord =
(BoundSheetRecord)record;
String sheetName = boundSheetRecord.getSheetname();
sheetNames.add(sheetName);
- if (log.isDebugEnabled()) {
- debug(record, "[" + sheetNames.size()
- + "], Name=[" + sheetName + "]");
- }
+ debug(record,
+ "[" + sheetNames.size()
+ + "], Name=[" + sheetName + "]");
break;
/* FormatRecord */
@@ -293,9 +277,7 @@
String dataFormat = formatRecord.getFormatString();
short formatIdx = formatRecord.getIndexCode();
formats.put(formatIdx, dataFormat);
- if (log.isDebugEnabled()) {
- debug(record, "[" + formatIdx + "]=[" + dataFormat +
"]");
- }
+ debug(record, "[" + formatIdx + "]=[" + dataFormat + "]");
break;
/* ExtendedFormatRecord */
@@ -305,10 +287,9 @@
short dataFormatIdx = xFormatRecord.getFormatIndex();
if (dataFormatIdx > 0) {
extendedFormats.put(currentXFormatIdx,
dataFormatIdx);
- if (log.isDebugEnabled()) {
- debug(record, "[" + currentXFormatIdx
- + "]=FormatRecord[" + dataFormatIdx +
"]");
- }
+ debug(record,
+ "[" + currentXFormatIdx
+ + "]=FormatRecord[" + dataFormatIdx + "]");
}
}
currentXFormatIdx++;
@@ -319,7 +300,7 @@
&& record instanceof CellValueRecordInterface) {
processCellValue(sid,
(CellValueRecordInterface)record);
} else {
- debug(record);
+ debug(record, "");
}
break;
}
@@ -453,15 +434,6 @@
}
}
return text;
- }
-
- /**
- * Record debugging.
- *
- * @param record The Record
- */
- private void debug(Record record) {
- debug(record, "");
}
/**