Author: dmeikle
Date: Fri Jan 8 18:15:15 2010
New Revision: 897282
URL: http://svn.apache.org/viewvc?rev=897282&view=rev
Log:
TIKA-103: Addition of POI supported number/date formatting handling within
XSSFExcelExtractorDecorator
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=897282&r1=897281&r2=897282&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Fri Jan 8 18:15:15 2010
@@ -17,13 +17,14 @@
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
-import java.text.NumberFormat;
import java.util.Iterator;
import java.util.Locale;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Comment;
+import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
@@ -37,17 +38,14 @@
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
/**
- * Format for rendering numbers in the worksheet. Currently we just
- * use the platform default formatting.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+ * Internal <code>DataFormatter</code> for formatting Numbers.
*/
- private final NumberFormat format;
+ private final DataFormatter formatter = new DataFormatter();
+
public XSSFExcelExtractorDecorator(
XSSFExcelExtractor extractor, Locale locale) {
super(extractor,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- this.format = NumberFormat.getInstance(locale);
}
/**
@@ -87,8 +85,11 @@
xhtml.characters(cell.getRichStringCellValue()
.getString());
} else if (type == Cell.CELL_TYPE_NUMERIC) {
- xhtml.characters(
- format.format(cell.getNumericCellValue()));
+ CellStyle style = cell.getCellStyle();
+ xhtml.characters(
+ formatter.formatRawCellContents(cell.getNumericCellValue(),
+ style.getIndex(),
+ style.getDataFormatString()));
} else {
XSSFCell xc = (XSSFCell) cell;
String rawValue = xc.getRawValue();
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=897282&r1=897281&r2=897282&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Jan 8 18:15:15 2010
@@ -41,7 +41,7 @@
try {
parser.parse(input, handler, metadata);
-
+
assertEquals(
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
metadata.get(Metadata.CONTENT_TYPE));
@@ -59,6 +59,73 @@
}
}
+ public void testExcelFormats() throws Exception {
+ InputStream input = OOXMLParserTest.class
+ .getResourceAsStream("/test-documents/testEXCEL-formats.xlsx");
+
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ // TODO: should auto-detect without the resource name
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
+ ContentHandler handler = new BodyContentHandler();
+
+ try {
+ parser.parse(input, handler, metadata);
+
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+
+ // Number #,##0.00
+ assertTrue(content.contains("1,599.99"));
+ assertTrue(content.contains("-1,599.99"));
+
+ // Currency $#,##0.00;[Red]($#,##0.00)
+ assertTrue(content.contains("$1,599.99"));
+ assertTrue(content.contains("($1,599.99)"));
+
+ // Scientific 0.00E+00
+ assertTrue(content.contains("1.98E08"));
+ assertTrue(content.contains("-1.98E08"));
+
+ // Percentage
+ assertTrue(content.contains("2%"));
+ assertTrue(content.contains("2.50%"));
+
+ // Time Format: h:mm
+ assertTrue(content.contains("6:15"));
+ assertTrue(content.contains("18:15"));
+
+ // Date Format: d-mmm-yy
+ assertTrue(content.contains("17-May-07"));
+
+ // Below assertions represent outstanding formatting issues to be addressed
+ // they are included to allow the issues to be progressed with the Apache POI
+ // team - See TIKA-103.
+
+ /*************************************************************************
+ // Date Format: m/d/yy
+ assertTrue(content.contains("03/10/2009"));
+
+ // Date/Time Format
+ assertTrue(content.contains("19/01/2008 04:35"));
+
+ // Custom Number (0 "dollars and" .00 "cents")
+ assertTrue(content.contains("19 dollars and .99 cents"));
+
+ // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+ assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007"));
+
+ // Fraction (2.5): # ?/?
+ assertTrue(content.contains("2 1 / 2"));
+ **************************************************************************/
+ } finally {
+ input.close();
+ }
+ }
+
public void testPowerPoint() throws Exception {
InputStream input = OOXMLParserTest.class
.getResourceAsStream("/test-documents/testPPT.pptx");
@@ -71,7 +138,7 @@
try {
parser.parse(input, handler, metadata);
-
+
assertEquals(
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
metadata.get(Metadata.CONTENT_TYPE));
@@ -98,7 +165,7 @@
try {
parser.parse(input, handler, metadata);
-
+
assertEquals(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
metadata.get(Metadata.CONTENT_TYPE));