Author: jukka
Date: Tue Mar 24 10:56:33 2009
New Revision: 757719
URL: http://svn.apache.org/viewvc?rev=757719&view=rev
Log:
TIKA-211: memory issue in ExcelExtractor
Use a single NumberFormat instance per parse() call to dramatically reduce
memory usage when parsing large worksheets.
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=757719&r1=757718&r2=757719&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Tue Mar 24 10:56:33 2009
@@ -19,6 +19,7 @@
import java.awt.Point;
import java.io.IOException;
import java.io.Serializable;
+import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
@@ -190,7 +191,15 @@
private SortedMap<Point, Cell> currentSheet = null;
/**
- * Contstruct a new listener instance outputting parsed data to
+ * Format for rendering numbers in the worksheet. Currently we just
+ * use the platform default formatting.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+ */
+ private final NumberFormat format = NumberFormat.getInstance();
+
+ /**
+ * Construct a new listener instance outputting parsed data to
* the specified XHTML content handler.
*
* @param handler Destination to write the parsed output to
@@ -254,7 +263,7 @@
case FormulaRecord.sid: // Cell value from a formula
FormulaRecord formula = (FormulaRecord) record;
- addCell(record, new NumberCell(formula.getValue()));
+ addCell(record, new NumberCell(formula.getValue(), format));
break;
case LabelRecord.sid: // strings stored directly in the cell
@@ -270,12 +279,12 @@
case NumberRecord.sid: // Contains a numeric cell value
NumberRecord number = (NumberRecord) record;
- addCell(record, new NumberCell(number.getValue()));
+ addCell(record, new NumberCell(number.getValue(), format));
break;
case RKRecord.sid: // Excel internal number record
RKRecord rk = (RKRecord) record;
- addCell(record, new NumberCell(rk.getRKNumber()));
+ addCell(record, new NumberCell(rk.getRKNumber(), format));
break;
case HyperlinkRecord.sid: // holds a URL associated with a cell
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=757719&r1=757718&r2=757719&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java Tue Mar 24 10:56:33 2009
@@ -35,10 +35,6 @@
this.format = format;
}
- public NumberCell(double number) {
- this(number, NumberFormat.getInstance());
- }
-
public void render(XHTMLContentHandler handler) throws SAXException {
handler.characters(format.format(number));
}