Author: nick
Date: Sun Nov 30 14:22:06 2014
New Revision: 1642548

URL: http://svn.apache.org/r1642548
Log:
Begin adding Excel 5 support to OldExcelExtractor for TIKA-1490

Added:
    poi/trunk/test-data/spreadsheet/testEXCEL_5.xls   (with props)
    poi/trunk/test-data/spreadsheet/testEXCEL_95.xls   (with props)
Modified:
    poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
    poi/trunk/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
    poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java

Modified: poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java?rev=1642548&r1=1642547&r2=1642548&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java (original)
+++ poi/trunk/src/java/org/apache/poi/hssf/extractor/OldExcelExtractor.java Sun Nov 30 14:22:06 2014
@@ -17,6 +17,8 @@
 
 package org.apache.poi.hssf.extractor;
 
+import java.io.BufferedInputStream;
+import java.io.Closeable;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -28,11 +30,15 @@ import org.apache.poi.hssf.record.OldLab
 import org.apache.poi.hssf.record.OldStringRecord;
 import org.apache.poi.hssf.record.RKRecord;
 import org.apache.poi.hssf.record.RecordInputStream;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.ss.usermodel.Cell;
 
 /**
- * A text extractor for very old (pre-OLE2) Excel files,
- *  such as Excel 4 files.
+ * A text extractor for old Excel files, which are too old for
+ *  HSSFWorkbook to handle. This includes Excel 95, and very old 
+ *  (pre-OLE2) Excel files, such as Excel 4 files.
  * <p>
  * Returns much (but not all) of the textual content of the file, 
  *  suitable for indexing by something like Apache Lucene, or used
@@ -40,13 +46,47 @@ import org.apache.poi.ss.usermodel.Cell;
  * </p>
  */
 public class OldExcelExtractor {
-    private InputStream input;
+    private RecordInputStream ris;
+    private Closeable input;
 
-    public OldExcelExtractor(InputStream input) {
-        this.input = input;
+    public OldExcelExtractor(InputStream input) throws IOException {
+        BufferedInputStream bstream = new BufferedInputStream(input, 8);
+        if (NPOIFSFileSystem.hasPOIFSHeader(bstream)) {
+            open(new NPOIFSFileSystem(bstream));
+        } else {
+            open(bstream);
+        }
     }
     public OldExcelExtractor(File f) throws IOException {
-        this.input = new FileInputStream(f);
+        InputStream input = new FileInputStream(f);
+        if (NPOIFSFileSystem.hasPOIFSHeader(input)) {
+            open(new NPOIFSFileSystem(f));
+        } else {
+            open(input);
+        }
+    }
+    public OldExcelExtractor(NPOIFSFileSystem fs) throws IOException {
+        open(fs);
+    }
+    public OldExcelExtractor(DirectoryNode directory) throws IOException {
+        open(directory);
+    }
+
+    private void open(InputStream biffStream) {
+        input = biffStream;
+        ris = new RecordInputStream(biffStream);
+    }
+    private void open(NPOIFSFileSystem fs) throws IOException {
+        input = fs;
+        open(fs.getRoot());
+    }
+    private void open(DirectoryNode directory) throws IOException {
+        DocumentNode book = (DocumentNode)directory.getEntry("Book");
+        if (book == null) {
+            throw new IOException("No Excel 5/95 Book stream found");
+        }
+        
+        ris = new RecordInputStream(directory.createDocumentInputStream(book));
     }
 
     public static void main(String[] args) throws Exception {
@@ -66,7 +106,6 @@ public class OldExcelExtractor {
     public String getText() {
         StringBuffer text = new StringBuffer();
 
-        RecordInputStream ris = new RecordInputStream(input);
         while (ris.hasNextRecord()) {
             int sid = ris.getNextSid();
             ris.nextRecord();
@@ -108,6 +147,14 @@ public class OldExcelExtractor {
                     ris.readFully(new byte[ris.remaining()]);
             }
         }
+        
+        if (input != null) {
+            try {
+                input.close();
+            } catch (IOException e) {}
+            input = null;
+        }
+        ris = null;
 
         return text.toString();
     }

Modified: poi/trunk/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java?rev=1642548&r1=1642547&r2=1642548&view=diff
==============================================================================
--- poi/trunk/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java (original)
+++ poi/trunk/src/testcases/org/apache/poi/hssf/dev/TestBiffViewer.java Sun Nov 30 14:22:06 2014
@@ -38,7 +38,9 @@ public class TestBiffViewer extends Base
                SILENT_EXCLUDED.add("46904.xls");
         SILENT_EXCLUDED.add("35897-type4.xls"); // unsupported crypto api header 
                SILENT_EXCLUDED.add("xor-encryption-abc.xls"); // unsupported XOR-encryption
-        SILENT_EXCLUDED.add("testEXCEL_4.xls"); // Biff 4 / Excel 4, pre-OLE2
+        SILENT_EXCLUDED.add("testEXCEL_4.xls");  // Biff 4 / Excel 4, pre-OLE2
+        SILENT_EXCLUDED.add("testEXCEL_5.xls");  // Biff 5 / Excel 5
+        SILENT_EXCLUDED.add("testEXCEL_95.xls"); // Biff 5 / Excel 95
        }
 
        @Override

Modified: poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java?rev=1642548&r1=1642547&r2=1642548&view=diff
==============================================================================
--- poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java (original)
+++ poi/trunk/src/testcases/org/apache/poi/hssf/extractor/TestOldExcelExtractor.java Sun Nov 30 14:22:06 2014
@@ -24,7 +24,8 @@ import junit.framework.TestCase;
 import org.apache.poi.hssf.HSSFTestDataSamples;
 
 /**
- * Unit tests for the Excel 4 (and older) text extractor
+ * Unit tests for the Excel 5/95 and Excel 4 (and older) text 
+ *  extractor
  */
 public final class TestOldExcelExtractor extends TestCase {
     private static OldExcelExtractor createExtractor(String sampleFileName) {
@@ -37,7 +38,7 @@ public final class TestOldExcelExtractor
         }
     }
 
-    public void testSimple() {
+    public void testSimpleExcel4() {
         OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
 
         // Check we can call getText without error
@@ -51,6 +52,22 @@ public final class TestOldExcelExtractor
         assertTrue(text, text.contains("11"));
         assertTrue(text, text.contains("784"));
     }
+    public void DISABLEDtestSimpleExcel5() {
+        for (String ver : new String[] {"5", "95"}) {
+            OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
+    
+            // Check we can call getText without error
+            String text = extractor.getText();
+    
+            // Check we find a few words we expect in there
+            assertTrue(text, text.contains("Sample Excel"));
+            assertTrue(text, text.contains("Written and saved"));
+            
+            // Check we find a few numbers we expect in there
+            assertTrue(text, text.contains("15"));
+            assertTrue(text, text.contains("169"));
+        }
+    }
 
     public void testStrings() {
         OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
@@ -71,7 +88,7 @@ public final class TestOldExcelExtractor
         // TODO Find some then test
     }
 
-    public void testFormattedNumbers() {
+    public void testFormattedNumbersExcel4() {
         OldExcelExtractor extractor = createExtractor("testEXCEL_4.xls");
         String text = extractor.getText();
 
@@ -88,4 +105,17 @@ public final class TestOldExcelExtractor
 //      assertTrue(text, text.contains("55,624"));
 //      assertTrue(text, text.contains("11,743,477"));
     }
+    public void DISABLEDtestFormattedNumbersExcel5() {
+        for (String ver : new String[] {"5", "95"}) {
+            OldExcelExtractor extractor = createExtractor("testEXCEL_"+ver+".xls");
+            String text = extractor.getText();
+            
+            // Simple numbers
+            assertTrue(text, text.contains("1"));
+            
+            // Numbers which come from formulas
+            assertTrue(text, text.contains("13"));
+            assertTrue(text, text.contains("169"));
+        }
+    }
 }

Added: poi/trunk/test-data/spreadsheet/testEXCEL_5.xls
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/spreadsheet/testEXCEL_5.xls?rev=1642548&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/spreadsheet/testEXCEL_5.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: poi/trunk/test-data/spreadsheet/testEXCEL_95.xls
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/spreadsheet/testEXCEL_95.xls?rev=1642548&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/spreadsheet/testEXCEL_95.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org

Reply via email to