Author: jukka
Date: Tue Mar 24 10:56:33 2009
New Revision: 757719

URL: http://svn.apache.org/viewvc?rev=757719&view=rev
Log:
TIKA-211: memory issue in ExcelExtractor

Use a single NumberFormat instance per parse() call to dramatically reduce 
memory usage when parsing large worksheets.

Modified:
    
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=757719&r1=757718&r2=757719&view=diff
==============================================================================
--- 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 (original)
+++ 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 Tue Mar 24 10:56:33 2009
@@ -19,6 +19,7 @@
 import java.awt.Point;
 import java.io.IOException;
 import java.io.Serializable;
+import java.text.NumberFormat;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
@@ -190,7 +191,15 @@
         private SortedMap<Point, Cell> currentSheet = null;
 
         /**
-         * Contstruct a new listener instance outputting parsed data to
+         * Format for rendering numbers in the worksheet. Currently we just
+         * use the platform default formatting.
+         *
+         * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+         */
+        private final NumberFormat format = NumberFormat.getInstance();
+
+        /**
+         * Construct a new listener instance outputting parsed data to
          * the specified XHTML content handler.
          *
          * @param handler Destination to write the parsed output to
@@ -254,7 +263,7 @@
 
             case FormulaRecord.sid: // Cell value from a formula
                 FormulaRecord formula = (FormulaRecord) record;
-                addCell(record, new NumberCell(formula.getValue()));
+                addCell(record, new NumberCell(formula.getValue(), format));
                 break;
 
             case LabelRecord.sid: // strings stored directly in the cell
@@ -270,12 +279,12 @@
 
             case NumberRecord.sid: // Contains a numeric cell value
                 NumberRecord number = (NumberRecord) record;
-                addCell(record, new NumberCell(number.getValue()));
+                addCell(record, new NumberCell(number.getValue(), format));
                 break;
 
             case RKRecord.sid: // Excel internal number record
                 RKRecord rk = (RKRecord) record;
-                addCell(record, new NumberCell(rk.getRKNumber()));
+                addCell(record, new NumberCell(rk.getRKNumber(), format));
                 break;
 
             case HyperlinkRecord.sid: // holds a URL associated with a cell

Modified: 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=757719&r1=757718&r2=757719&view=diff
==============================================================================
--- 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
 (original)
+++ 
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
 Tue Mar 24 10:56:33 2009
@@ -35,10 +35,6 @@
         this.format = format;
     }
 
-    public NumberCell(double number) {
-        this(number, NumberFormat.getInstance());
-    }
-
     public void render(XHTMLContentHandler handler) throws SAXException {
         handler.characters(format.format(number));
     }


Reply via email to