I have a 3MB .xls file with 26 sheets. Half of the sheets contain a matrix of approximately 1100 rows by columns A-P, and the others contain approximately 1000 rows by columns A-E.

Using the v0.9 ExcelExtractor, I left it extracting text on a reasonably powerful machine @ 100% CPU (Java 1.6). Just over 4 hours later it was still going!!

I finally gave up waiting and stopped it.

After I changed the extractor to use a StringBuffer, it takes 3 seconds to extract the 1088233 characters of text. My changes to extractText() are below, if wanted.

Antony

  protected String extractText(InputStream input) throws Exception {

    String resultText = "";
    HSSFWorkbook wb = new HSSFWorkbook(input);
    if (wb == null) {
      return resultText;
    }

    HSSFSheet sheet;
    HSSFRow row;
    HSSFCell cell;
    int sNum = 0;
    int rNum = 0;
    int cNum = 0;

    sNum = wb.getNumberOfSheets();

    //  Allow 4K per sheet - seems a reasonable start
    StringBuffer sb = new StringBuffer(4096 * sNum);
    for (int i=0; i<sNum; i++) {
      if ((sheet = wb.getSheetAt(i)) == null) {
        continue;
      }
      rNum = sheet.getLastRowNum();
      for (int j=0; j<=rNum; j++) {
        if ((row = sheet.getRow(j)) == null){
          continue;
        }
        cNum = row.getLastCellNum();

        for (int k=0; k<cNum; k++) {
          if ((cell = row.getCell((short) k)) != null) {
            /*if(HSSFDateUtil.isCellDateFormatted(cell) == true) {
                resultText += cell.getDateCellValue().toString() + " ";
              } else
             */
            if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                sb.append(cell.getStringCellValue());
                sb.append(' ');
            } else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
              Double d = new Double(cell.getNumericCellValue());
              sb.append(d.toString());
              sb.append(' ');
            }
            /* else if(cell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){
                 resultText += cell.getCellFormula() + " ";
               }
             */
          }
        }
      }
    }
    return sb.toString();

Reply via email to