Author: dmeikle
Date: Fri Jan  8 18:15:15 2010
New Revision: 897282

URL: http://svn.apache.org/viewvc?rev=897282&view=rev
Log:
TIKA-103: Addition of POI supported number/date formatting handling within 
XSSFExcelExtractorDecorator

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=897282&r1=897281&r2=897282&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 Fri Jan  8 18:15:15 2010
@@ -17,13 +17,14 @@
 package org.apache.tika.parser.microsoft.ooxml;
 
 import java.io.IOException;
-import java.text.NumberFormat;
 import java.util.Iterator;
 import java.util.Locale;
 
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
 import org.apache.poi.ss.usermodel.Comment;
+import org.apache.poi.ss.usermodel.DataFormatter;
 import org.apache.poi.ss.usermodel.HeaderFooter;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
@@ -37,17 +38,14 @@
 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
 
     /**
-     * Format for rendering numbers in the worksheet. Currently we just
-     * use the platform default formatting.
-     *
-     * @see <a 
href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+     * Internal <code>DataFormatter</code> for formatting Numbers.
      */
-    private final NumberFormat format;
+       private final DataFormatter formatter = new DataFormatter();
+
 
     public XSSFExcelExtractorDecorator(
             XSSFExcelExtractor extractor, Locale locale) {
         super(extractor, 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-        this.format = NumberFormat.getInstance(locale);
     }
 
     /**
@@ -87,8 +85,11 @@
                         xhtml.characters(cell.getRichStringCellValue()
                                 .getString());
                     } else if (type == Cell.CELL_TYPE_NUMERIC) {
-                        xhtml.characters(
-                                format.format(cell.getNumericCellValue()));
+                        CellStyle style = cell.getCellStyle();
+                              xhtml.characters(
+                                       
formatter.formatRawCellContents(cell.getNumericCellValue(),
+                                                                               
                        style.getIndex(),
+                                                                               
                        style.getDataFormatString()));
                     } else {
                         XSSFCell xc = (XSSFCell) cell;
                         String rawValue = xc.getRawValue();

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=897282&r1=897281&r2=897282&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Fri Jan  8 18:15:15 2010
@@ -41,7 +41,7 @@
 
         try {
             parser.parse(input, handler, metadata);
-            
+
             assertEquals(
                     
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                     metadata.get(Metadata.CONTENT_TYPE));
@@ -59,6 +59,73 @@
         }
     }
 
+    public void testExcelFormats() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/testEXCEL-formats.xlsx");
+
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+        // TODO: should auto-detect without the resource name
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
+        ContentHandler handler = new BodyContentHandler();
+
+        try {
+            parser.parse(input, handler, metadata);
+
+            assertEquals(
+                    
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                    metadata.get(Metadata.CONTENT_TYPE));
+
+            String content = handler.toString();
+
+            // Number #,##0.00
+            assertTrue(content.contains("1,599.99"));
+            assertTrue(content.contains("-1,599.99"));
+
+            // Currency $#,##0.00;[Red]($#,##0.00)
+            assertTrue(content.contains("$1,599.99"));
+            assertTrue(content.contains("($1,599.99)"));
+
+            // Scientific 0.00E+00
+            assertTrue(content.contains("1.98E08"));
+            assertTrue(content.contains("-1.98E08"));
+
+            // Percentage
+            assertTrue(content.contains("2%"));
+            assertTrue(content.contains("2.50%"));
+
+            // Time Format: h:mm
+            assertTrue(content.contains("6:15"));
+            assertTrue(content.contains("18:15"));
+
+            // Date Format: d-mmm-yy
+            assertTrue(content.contains("17-May-07"));
+
+            // Below assertions represent outstanding formatting issues to be 
addressed
+            // they are included to allow the issues to be progressed with the 
Apache POI
+            // team - See TIKA-103.
+
+            
/*************************************************************************
+            // Date Format: m/d/yy
+            assertTrue(content.contains("03/10/2009"));
+
+            // Date/Time Format
+            assertTrue(content.contains("19/01/2008 04:35"));
+
+            // Custom Number (0 "dollars and" .00 "cents")
+            assertTrue(content.contains("19 dollars and .99 cents"));
+
+            // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+            assertTrue(content.contains("At 4:20 AM on Thursday May 17, 
2007"));
+
+            // Fraction (2.5): # ?/?
+            assertTrue(content.contains("2 1 / 2"));
+            
**************************************************************************/
+        } finally {
+            input.close();
+        }
+    }
+
     public void testPowerPoint() throws Exception {
         InputStream input = OOXMLParserTest.class
                 .getResourceAsStream("/test-documents/testPPT.pptx");
@@ -71,7 +138,7 @@
 
         try {
             parser.parse(input, handler, metadata);
-            
+
             assertEquals(
                     
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
                     metadata.get(Metadata.CONTENT_TYPE));
@@ -98,7 +165,7 @@
 
         try {
             parser.parse(input, handler, metadata);
-            
+
             assertEquals(
                     
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                     metadata.get(Metadata.CONTENT_TYPE));


Reply via email to