This is an automated email from the ASF dual-hosted git repository. arjansh pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/metamodel.git
commit a58159ab60c113745036075de117bd9044b02bea Author: Gerard Dellemann <g.dellem...@quadient.com> AuthorDate: Wed Dec 4 17:19:54 2019 +0100 MM-82 Detect Column Types --- excel/pom.xml | 2 +- .../excel/DefaultSpreadsheetReaderDelegate.java | 214 +++++++++++++++++++-- .../apache/metamodel/excel/ExcelConfiguration.java | 47 ++++- .../apache/metamodel/excel/ExcelDataContext.java | 2 +- .../org/apache/metamodel/excel/ExcelUtils.java | 160 ++++++++++++++- .../metamodel/excel/ExcelConfigurationTest.java | 10 +- .../metamodel/excel/ExcelDataContextTest.java | 164 ++++++++++++++++ excel/src/test/resources/different_datatypes.xls | Bin 0 -> 35840 bytes excel/src/test/resources/different_datatypes.xlsx | Bin 0 -> 9870 bytes 9 files changed, 558 insertions(+), 41 deletions(-) diff --git a/excel/pom.xml b/excel/pom.xml index 7f8e8f0..a87f50f 100644 --- a/excel/pom.xml +++ b/excel/pom.xml @@ -40,7 +40,7 @@ under the License. <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-ooxml</artifactId> - <version>4.0.1</version> + <version>4.1.0</version> <exclusions> <exclusion> <groupId>commons-logging</groupId> diff --git a/excel/src/main/java/org/apache/metamodel/excel/DefaultSpreadsheetReaderDelegate.java b/excel/src/main/java/org/apache/metamodel/excel/DefaultSpreadsheetReaderDelegate.java index 97ba50f..41e2b75 100644 --- a/excel/src/main/java/org/apache/metamodel/excel/DefaultSpreadsheetReaderDelegate.java +++ b/excel/src/main/java/org/apache/metamodel/excel/DefaultSpreadsheetReaderDelegate.java @@ -18,6 +18,7 @@ */ package org.apache.metamodel.excel; +import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.stream.Collectors; @@ -41,6 +42,8 @@ import org.apache.metamodel.schema.naming.ColumnNamingStrategy; import org.apache.metamodel.util.FileHelper; import org.apache.metamodel.util.Resource; import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.DateUtil; +import org.apache.poi.ss.usermodel.FormulaEvaluator; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; @@ -53,6 +56,9 @@ import org.slf4j.LoggerFactory; */ final class DefaultSpreadsheetReaderDelegate implements SpreadsheetReaderDelegate { + static final ColumnType DEFAULT_COLUMN_TYPE = ColumnType.STRING; + static final ColumnType LEGACY_COLUMN_TYPE = ColumnType.VARCHAR; + private static final Logger logger = LoggerFactory.getLogger(DefaultSpreadsheetReaderDelegate.class); private final Resource _resource; @@ -128,6 +134,11 @@ final class DefaultSpreadsheetReaderDelegate implements SpreadsheetReaderDelegat row = rowIterator.next(); } + final ColumnType[] columnTypes = getColumnTypes(sheet); + if (columnTypes == null) { + return table; + } + final int columnNameLineNumber = _configuration.getColumnNameLineNumber(); if (columnNameLineNumber == ExcelConfiguration.NO_COLUMN_NAME_LINE) { @@ -146,34 +157,196 @@ final class DefaultSpreadsheetReaderDelegate implements SpreadsheetReaderDelegat columnNamingSession.getNextColumnName(new ColumnNamingContextImpl(i)); } - for (int j = offset; j < row.getLastCellNum(); j++) { - final ColumnNamingContext namingContext = new ColumnNamingContextImpl(table, null, j); + for (int i = offset; i < row.getLastCellNum(); i++) { + final ColumnNamingContext namingContext = new ColumnNamingContextImpl(table, null, i); final Column column = new MutableColumn(columnNamingSession.getNextColumnName(namingContext), - ColumnType.STRING, table, j, true); + columnTypes[i], table, i, true); table.addColumn(column); } } } else { + row = iterateToColumnNameRow(rowIterator, row); - boolean hasColumns = true; + if (row != null) { + createColumns(table, wb, row, columnTypes); + } + } - // iterate to the column name line number (if above 1) - for (int j = 1; j < columnNameLineNumber; j++) { - if (rowIterator.hasNext()) { - row = rowIterator.next(); - } else { - hasColumns = false; - break; + return table; + } + + /** + * Iterate to the column name row if the configured ColumnNameLineNumber is above 1. + * @param rowIterator + * @param currentRow + * @return Returns the column name row. Returns the current row if the configured ColumnNameLineNumber is 1. + * Returns null if the columnName row is not found. + */ + private Row iterateToColumnNameRow(final Iterator<Row> rowIterator, final Row currentRow) { + Row row = currentRow; + + // iterate to the column name line number (if above 1) + for (int i = 1; i < _configuration.getColumnNameLineNumber(); i++) { + if (rowIterator.hasNext()) { + row = rowIterator.next(); + } else { + return null; + } + } + + return row; + } + + /** + * Get an array of {@link ColumnType}s. The length of the array is determined by the header row. If there's no + * configured column name line, then the first data row is used. If the {@link ColumnType} should be detected, then + * this is done by using the data rows only. If this shouldn't be detected, then the array is filled with either + * default column type when there is no column name line or legacy column type when there is a column name line. + * @param sheet + * @return + */ + private ColumnType[] getColumnTypes(final Sheet sheet) { + // To find the array length we need the header + final Iterator<Row> iterator = ExcelUtils.getRowIterator(sheet, _configuration, false); + Row row; + if (_configuration.getColumnNameLineNumber() == ExcelConfiguration.NO_COLUMN_NAME_LINE) { + row = findTheFirstNonEmptyRow(iterator); + } else { + row = iterateToColumnNameRow(iterator, iterator.next()); + } + if (row == null) { + return null; + } + + final ColumnType[] columnTypes = new ColumnType[row.getLastCellNum()]; + + if (_configuration.isDetectColumnTypes()) { + // Now we need the first data row + row = findTheFirstNonEmptyRow(iterator); + if (row != null) { + detectColumnTypes(row, iterator, columnTypes); + } + } else { + if (_configuration.getColumnNameLineNumber() == ExcelConfiguration.NO_COLUMN_NAME_LINE) { + Arrays.fill(columnTypes, DEFAULT_COLUMN_TYPE); + } else { + Arrays.fill(columnTypes, LEGACY_COLUMN_TYPE); + } + } + return columnTypes; + } + + private static Row findTheFirstNonEmptyRow(final Iterator<Row> rowIterator) { + while (rowIterator.hasNext()) { + final Row row = rowIterator.next(); + if (row != null) { + return row; + } + } + return null; + } + + private void detectColumnTypes(Row firstRow, final Iterator<Row> dataRowIterator, final ColumnType[] columnTypes) { + detectColumnTypesFirstRow(firstRow, columnTypes); + detectColumnTypesOtherRows(dataRowIterator, columnTypes); + + // If all cells are null, then this loop sets the column type to the default + for (int i = 0; i < columnTypes.length; i++) { + if (columnTypes[i] == null) { + columnTypes[i] = DEFAULT_COLUMN_TYPE; + } + } + } + + private void detectColumnTypesFirstRow(final Row firstRow, final ColumnType[] columnTypes) { + if (firstRow != null && firstRow.getLastCellNum() > 0) { + for (int i = getColumnOffset(firstRow); i < columnTypes.length; i++) { + if (firstRow.getCell(i) != null) { + columnTypes[i] = determineColumnTypeFromCell(firstRow.getCell(i)); + } + } + } + } + + private void detectColumnTypesOtherRows(final Iterator<Row> dataRowIterator, final ColumnType[] columnTypes) { + int numberOfLinesToScan = _configuration.getNumberOfLinesToScan() - 1; + + while (dataRowIterator.hasNext() && numberOfLinesToScan-- > 0) { + final Row currentRow = dataRowIterator.next(); + if (currentRow != null && currentRow.getLastCellNum() > 0) { + for (int i = getColumnOffset(currentRow); i < columnTypes.length; i++) { + final ColumnType detectNewColumnType = detectNewColumnTypeCell(columnTypes[i], currentRow + .getCell(i)); + if (detectNewColumnType != null) { + columnTypes[i] = detectNewColumnType; + } } } + } + } - if (hasColumns) { - createColumns(table, wb, row); + /** + * Tries to detect a new {@link ColumnType} for a cell. + * @param currentColumnType + * @param cell + * @return Returns a new {@link ColumnType} when detected. Otherwise null is returned. + */ + private static ColumnType detectNewColumnTypeCell(final ColumnType currentColumnType, final Cell cell) { + // Can't detect something new if it's already on the default. + if (currentColumnType != null && currentColumnType.equals(DEFAULT_COLUMN_TYPE)) { + return null; + } + // Skip if the cell is null. This way 1 missing cell can't influence the column type of all other cells. + if (cell == null) { + return null; + } + + final ColumnType detectedColumnType = determineColumnTypeFromCell(cell); + if (currentColumnType == null) { + return detectedColumnType; + } else if (!currentColumnType.equals(detectedColumnType)) { + // If the column type is Double and a Integer is detected, then don't set it to Integer + if (currentColumnType.equals(ColumnType.INTEGER) && detectedColumnType.equals(ColumnType.DOUBLE)) { + // If the column type is Integer and a Double is detected, then set it to Double + return detectedColumnType; + } else if (currentColumnType.equals(ColumnType.DOUBLE) && detectedColumnType.equals(ColumnType.INTEGER)) { + return null; + } else { + return DEFAULT_COLUMN_TYPE; } } + return null; + } - return table; + private static ColumnType determineColumnTypeFromCell(final Cell cell) { + switch (cell.getCellType()) { + case NUMERIC: + if (DateUtil.isCellDateFormatted(cell)) { + return ColumnType.DATE; + } else { + return cell.getNumericCellValue() % 1 == 0 ? ColumnType.INTEGER : ColumnType.DOUBLE; + } + case BOOLEAN: + return ColumnType.BOOLEAN; + case FORMULA: + final FormulaEvaluator evaluator = cell + .getSheet() + .getWorkbook() + .getCreationHelper() + .createFormulaEvaluator(); + return determineColumnTypeFromCell(evaluator.evaluateInCell(cell)); + case STRING: + // fall through + case BLANK: + // fall through + case _NONE: + // fall through + case ERROR: + // fall through + default: + return DEFAULT_COLUMN_TYPE; + } } /** @@ -183,7 +356,8 @@ final class DefaultSpreadsheetReaderDelegate implements SpreadsheetReaderDelegat * @param wb * @param row */ - private void createColumns(MutableTable table, Workbook wb, Row row) { + private void createColumns(final MutableTable table, final Workbook wb, final Row row, + final ColumnType[] columnTypes) { if (row == null) { logger.warn("Cannot create columns based on null row!"); return; @@ -195,13 +369,13 @@ final class DefaultSpreadsheetReaderDelegate implements SpreadsheetReaderDelegat // build columns based on cell values. try (final ColumnNamingSession columnNamingSession = _configuration.getColumnNamingStrategy() .startColumnNamingSession()) { - for (int j = offset; j < rowLength; j++) { - final Cell cell = row.getCell(j); - final String intrinsicColumnName = ExcelUtils.getCellValue(wb, cell); + for (int i = offset; i < rowLength; i++) { + final Cell cell = row.getCell(i); + final String intrinsicColumnName = (String) ExcelUtils.getCellValue(wb, cell); final ColumnNamingContext columnNamingContext = new ColumnNamingContextImpl(table, intrinsicColumnName, - j); + i); final String columnName = columnNamingSession.getNextColumnName(columnNamingContext); - final Column column = new MutableColumn(columnName, ColumnType.VARCHAR, table, j, true); + final Column column = new MutableColumn(columnName, columnTypes[i], table, i, true); table.addColumn(column); } } diff --git a/excel/src/main/java/org/apache/metamodel/excel/ExcelConfiguration.java b/excel/src/main/java/org/apache/metamodel/excel/ExcelConfiguration.java index 4779bb1..52d2ad5 100644 --- a/excel/src/main/java/org/apache/metamodel/excel/ExcelConfiguration.java +++ b/excel/src/main/java/org/apache/metamodel/excel/ExcelConfiguration.java @@ -37,26 +37,39 @@ public final class ExcelConfiguration extends BaseObject implements public static final int NO_COLUMN_NAME_LINE = 0; public static final int DEFAULT_COLUMN_NAME_LINE = 1; + public static final int DEFAULT_NUMBERS_OF_LINES_TO_SCAN = 10; private final int columnNameLineNumber; private final ColumnNamingStrategy columnNamingStrategy; private final boolean skipEmptyLines; private final boolean skipEmptyColumns; + private final boolean detectColumnTypes; + private final int numberOfLinesToScan; public ExcelConfiguration() { this(DEFAULT_COLUMN_NAME_LINE, true, false); } - public ExcelConfiguration(int columnNameLineNumber, boolean skipEmptyLines, boolean skipEmptyColumns) { + public ExcelConfiguration(final int columnNameLineNumber, final boolean skipEmptyLines, + final boolean skipEmptyColumns) { this(columnNameLineNumber, null, skipEmptyLines, skipEmptyColumns); } - public ExcelConfiguration(int columnNameLineNumber, ColumnNamingStrategy columnNamingStrategy, - boolean skipEmptyLines, boolean skipEmptyColumns) { + public ExcelConfiguration(final int columnNameLineNumber, final ColumnNamingStrategy columnNamingStrategy, + final boolean skipEmptyLines, final boolean skipEmptyColumns) { + this(columnNameLineNumber, columnNamingStrategy, skipEmptyLines, skipEmptyColumns, false, + DEFAULT_NUMBERS_OF_LINES_TO_SCAN); + } + + public ExcelConfiguration(final int columnNameLineNumber, final ColumnNamingStrategy columnNamingStrategy, + final boolean skipEmptyLines, final boolean skipEmptyColumns, final boolean detectColumnTypes, + final int numberOfLinesToScan) { this.columnNameLineNumber = columnNameLineNumber; this.skipEmptyLines = skipEmptyLines; this.skipEmptyColumns = skipEmptyColumns; this.columnNamingStrategy = columnNamingStrategy; + this.detectColumnTypes = detectColumnTypes; + this.numberOfLinesToScan = numberOfLinesToScan; } /** @@ -102,17 +115,39 @@ public final class ExcelConfiguration extends BaseObject implements return skipEmptyColumns; } + /** + * Defines if columns in the excel spreadsheet should be validated on datatypes while + * reading the spreadsheet. + * + * @return a boolean indicating whether or not to validate column types. + */ + public boolean isDetectColumnTypes() { + return detectColumnTypes; + } + + /** + * The number of lines to scan when detecting the column types + * + * @return an int indicating the numbers of lines that will be scanned + */ + public int getNumberOfLinesToScan() { + return numberOfLinesToScan; + } + @Override protected void decorateIdentity(List<Object> identifiers) { identifiers.add(columnNameLineNumber); identifiers.add(skipEmptyLines); identifiers.add(skipEmptyColumns); + identifiers.add(detectColumnTypes); + identifiers.add(numberOfLinesToScan); } @Override public String toString() { - return "ExcelConfiguration[columnNameLineNumber=" - + columnNameLineNumber + ", skipEmptyLines=" + skipEmptyLines - + ", skipEmptyColumns=" + skipEmptyColumns + "]"; + return String + .format("ExcelConfiguration[columnNameLineNumber=%s, skipEmptyLines=%s, skipEmptyColumns=%s, " + + "detectColumnTypes=%s, numbersOfLinesToScan=%s]", columnNameLineNumber, skipEmptyLines, + skipEmptyColumns, detectColumnTypes, numberOfLinesToScan); } } diff --git a/excel/src/main/java/org/apache/metamodel/excel/ExcelDataContext.java b/excel/src/main/java/org/apache/metamodel/excel/ExcelDataContext.java index b1f8149..0829109 100644 --- a/excel/src/main/java/org/apache/metamodel/excel/ExcelDataContext.java +++ b/excel/src/main/java/org/apache/metamodel/excel/ExcelDataContext.java @@ -189,7 +189,7 @@ public final class ExcelDataContext extends QueryPostprocessDataContext implemen if (_spreadsheetReaderDelegate == null) { _spreadsheetReaderDelegate = _resource.read(in -> { try { - if (FileMagic.valueOf(in) == FileMagic.OOXML) { + if (FileMagic.valueOf(in) == FileMagic.OOXML && !_configuration.isDetectColumnTypes()) { return new XlsxSpreadsheetReaderDelegate(_resource, _configuration); } else { return new DefaultSpreadsheetReaderDelegate(_resource, _configuration); diff --git a/excel/src/main/java/org/apache/metamodel/excel/ExcelUtils.java b/excel/src/main/java/org/apache/metamodel/excel/ExcelUtils.java index 2da6ef3..f05e86c 100644 --- a/excel/src/main/java/org/apache/metamodel/excel/ExcelUtils.java +++ b/excel/src/main/java/org/apache/metamodel/excel/ExcelUtils.java @@ -19,7 +19,6 @@ package org.apache.metamodel.excel; import java.io.File; -import java.text.NumberFormat; import java.util.Date; import java.util.Iterator; import java.util.List; @@ -37,11 +36,11 @@ import org.apache.metamodel.data.Style; import org.apache.metamodel.data.Style.SizeUnit; import org.apache.metamodel.data.StyleBuilder; import org.apache.metamodel.query.SelectItem; +import org.apache.metamodel.schema.ColumnType; import org.apache.metamodel.schema.Table; import org.apache.metamodel.util.DateUtils; import org.apache.metamodel.util.FileHelper; import org.apache.metamodel.util.FileResource; -import org.apache.metamodel.util.FormatHelper; import org.apache.metamodel.util.InMemoryResource; import org.apache.metamodel.util.Resource; import org.apache.poi.hssf.usermodel.HSSFDateUtil; @@ -49,9 +48,11 @@ import org.apache.poi.hssf.usermodel.HSSFFont; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hssf.util.HSSFColor; import org.apache.poi.ss.formula.FormulaParseException; +import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.CellStyle; import org.apache.poi.ss.usermodel.Color; +import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.FillPatternType; import org.apache.poi.ss.usermodel.Font; import org.apache.poi.ss.usermodel.FontUnderline; @@ -76,8 +77,6 @@ final class ExcelUtils { private static final Logger logger = LoggerFactory.getLogger(ExcelUtils.class); - private static final NumberFormat _numberFormat = FormatHelper.getUiNumberFormat(); - private ExcelUtils() { // prevent instantiation } @@ -222,7 +221,62 @@ final class ExcelUtils { } else { // TODO: Consider not formatting it, but simple using // Double.toString(...) - result = _numberFormat.format(cell.getNumericCellValue()); + result = getNumericCellValueAsStringUsingBuildinFormat(cell.getCellStyle(), cell.getNumericCellValue()); + } + break; + case STRING: + result = cell.getRichStringCellValue().getString(); + break; + default: + throw new IllegalStateException("Unknown cell type: " + cell.getCellType()); + } + + logger.debug("cell {} resolved to value: {}", cellCoordinate, result); + + return result; + } + + public static Object getCellValueAsObject(Workbook wb, Cell cell) { + if (cell == null) { + return null; + } + + final String cellCoordinate = "(" + cell.getRowIndex() + "," + cell.getColumnIndex() + ")"; + + final Object result; + + switch (cell.getCellType()) { + case BLANK: + case _NONE: + result = null; + break; + case BOOLEAN: + result = Boolean.valueOf(cell.getBooleanCellValue()); + break; + case ERROR: + String errorResult; + try { + errorResult = FormulaError.forInt(cell.getErrorCellValue()).getString(); + } catch (RuntimeException e) { + logger.debug("Getting error code for {} failed!: {}", cellCoordinate, e.getMessage()); + if (cell instanceof XSSFCell) { + // hack to get error string, which is available + errorResult = ((XSSFCell) cell).getErrorCellString(); + } else { + logger.error("Couldn't handle unexpected error scenario in cell: " + cellCoordinate, e); + throw e; + } + } + result = errorResult; + break; + case FORMULA: + result = getFormulaCellValueAsObject(wb, cell); + break; + case NUMERIC: + if (HSSFDateUtil.isCellDateFormatted(cell)) { + result = cell.getDateCellValue(); + } else { + result = getIntegerOrDoubleValueFromDouble(cell.getNumericCellValue()); } break; case STRING: @@ -237,13 +291,34 @@ final class ExcelUtils { return result; } + public static Object getCellValueChecked(final Workbook wb, final Cell cell, final ColumnType expectedColumnType) { + final Object value = getCellValueAsObject(wb, cell); + if (value == null || value.getClass().equals(expectedColumnType.getJavaEquivalentClass())) { + return value; + } + + if (!(value.getClass().equals(Integer.class) && expectedColumnType + .getJavaEquivalentClass() + .equals(Double.class)) && logger.isWarnEnabled()) { + // Don't log when a Integer value is in a Double column type + final String cellCoordinate = "(" + cell.getRowIndex() + "," + cell.getColumnIndex() + ")"; + final String warning = String + .format("Cell %s has the value '%s' of data type '%s', which doesn't match the detected column's " + + "data type '%s'. This cell gets value NULL in the DataSet.", cellCoordinate, value, value + .getClass() + .getSimpleName(), expectedColumnType); + logger.warn(warning); + } + return null; + } + private static String getFormulaCellValue(Workbook wb, Cell cell) { // first try with a cached/precalculated value try { double numericCellValue = cell.getNumericCellValue(); // TODO: Consider not formatting it, but simple using // Double.toString(...) - return _numberFormat.format(numericCellValue); + return getNumericCellValueAsStringUsingBuildinFormat(cell.getCellStyle(), numericCellValue); } catch (Exception e) { if (logger.isInfoEnabled()) { logger.info("Failed to fetch cached/precalculated formula value of cell: " + cell, e); @@ -281,6 +356,57 @@ final class ExcelUtils { return cell.getCellFormula(); } + private static Object getFormulaCellValueAsObject(final Workbook wb, final Cell cell) { + // first try with a cached/precalculated value + try { + return getIntegerOrDoubleValueFromDouble(cell.getNumericCellValue()); + } catch (Exception e) { + if (logger.isInfoEnabled()) { + logger.info("Failed to fetch cached/precalculated formula value of cell: " + cell, e); + } + } + + // evaluate cell first, if possible + try { + if (logger.isInfoEnabled()) { + logger + .info("cell({},{}) is a formula. Attempting to evaluate: {}", cell.getRowIndex(), cell + .getColumnIndex(), cell.getCellFormula()); + } + + final FormulaEvaluator evaluator = wb.getCreationHelper().createFormulaEvaluator(); + + // calculates the formula and puts it's value back into the cell + final Cell evaluatedCell = evaluator.evaluateInCell(cell); + + return getCellValueAsObject(wb, evaluatedCell); + } catch (RuntimeException e) { + logger + .warn("Exception occurred while evaluating formula at position ({},{}): {}", cell.getRowIndex(), + cell.getColumnIndex(), e.getMessage()); + // Some exceptions we simply log - result will be then be the actual formula + if (e instanceof FormulaParseException) { + logger.error("Parse exception occurred while evaluating cell formula: " + cell, e); + } else if (e instanceof IllegalArgumentException) { + logger.error("Illegal formula argument occurred while evaluating cell formula: " + cell, e); + } else { + logger.error("Unexpected exception occurred while evaluating cell formula: " + cell, e); + } + } + + // last resort: return the string formula + return cell.getCellFormula(); + } + + private static Number getIntegerOrDoubleValueFromDouble(double value) { + final Double doubleValue = Double.valueOf(value); + if (doubleValue % 1 == 0 && doubleValue <= Integer.MAX_VALUE) { + return Integer.valueOf(doubleValue.intValue()); + } else { + return doubleValue; + } + } + public static Style getCellStyle(Workbook workbook, Cell cell) { if (cell == null) { return Style.NO_STYLE; @@ -414,13 +540,21 @@ final class ExcelUtils { */ public static DefaultRow createRow(Workbook workbook, Row row, DataSetHeader header) { final int size = header.size(); - final String[] values = new String[size]; + final Object[] values = new Object[size]; final Style[] styles = new Style[size]; if (row != null) { for (int i = 0; i < size; i++) { final int columnNumber = header.getSelectItem(i).getColumn().getColumnNumber(); + final ColumnType columnType = header.getSelectItem(i).getColumn().getType(); final Cell cell = row.getCell(columnNumber); - final String value = ExcelUtils.getCellValue(workbook, cell); + final Object value; + if (columnType.equals(DefaultSpreadsheetReaderDelegate.DEFAULT_COLUMN_TYPE) || columnType + .equals(DefaultSpreadsheetReaderDelegate.LEGACY_COLUMN_TYPE)) { + value = ExcelUtils.getCellValue(workbook, cell); + } else { + value = ExcelUtils.getCellValueChecked(workbook, cell, columnType); + } + final Style style = ExcelUtils.getCellStyle(workbook, cell); values[i] = value; styles[i] = style; @@ -443,4 +577,14 @@ final class ExcelUtils { final DataSet dataSet = new XlsDataSet(selectItems, workbook, rowIterator); return dataSet; } + + private static String getNumericCellValueAsStringUsingBuildinFormat(CellStyle cellStyle, double cellValue) { + int formatIndex = cellStyle.getDataFormat(); + String formatString = cellStyle.getDataFormatString(); + if (formatString == null) { + formatString = BuiltinFormats.getBuiltinFormat(formatIndex); + } + final DataFormatter formatter = new DataFormatter(); + return formatter.formatRawCellContents(cellValue, formatIndex, formatString); + } } diff --git a/excel/src/test/java/org/apache/metamodel/excel/ExcelConfigurationTest.java b/excel/src/test/java/org/apache/metamodel/excel/ExcelConfigurationTest.java index 6e7559c..a652e72 100644 --- a/excel/src/test/java/org/apache/metamodel/excel/ExcelConfigurationTest.java +++ b/excel/src/test/java/org/apache/metamodel/excel/ExcelConfigurationTest.java @@ -18,16 +18,16 @@ */ package org.apache.metamodel.excel; -import org.apache.metamodel.excel.ExcelConfiguration; - import junit.framework.TestCase; public class ExcelConfigurationTest extends TestCase { public void testToString() throws Exception { - ExcelConfiguration conf = new ExcelConfiguration(1, true, false); - assertEquals( - "ExcelConfiguration[columnNameLineNumber=1, skipEmptyLines=true, skipEmptyColumns=false]", + final ExcelConfiguration conf = new ExcelConfiguration(1, true, false); + assertEquals(String + .format("ExcelConfiguration[columnNameLineNumber=%s, skipEmptyLines=%s, skipEmptyColumns=%s, " + + "detectColumnTypes=%s, numbersOfLinesToScan=%s]", ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, + true, false, false, ExcelConfiguration.DEFAULT_NUMBERS_OF_LINES_TO_SCAN), conf.toString()); } diff --git a/excel/src/test/java/org/apache/metamodel/excel/ExcelDataContextTest.java b/excel/src/test/java/org/apache/metamodel/excel/ExcelDataContextTest.java index 13cfdec..3d05c83 100644 --- a/excel/src/test/java/org/apache/metamodel/excel/ExcelDataContextTest.java +++ b/excel/src/test/java/org/apache/metamodel/excel/ExcelDataContextTest.java @@ -21,8 +21,10 @@ package org.apache.metamodel.excel; import java.io.File; import java.util.Arrays; import java.util.List; +import java.util.stream.IntStream; import org.apache.metamodel.DataContext; +import org.apache.metamodel.MetaModelException; import org.apache.metamodel.MetaModelHelper; import org.apache.metamodel.UpdateCallback; import org.apache.metamodel.UpdateScript; @@ -30,15 +32,19 @@ import org.apache.metamodel.data.DataSet; import org.apache.metamodel.data.Row; import org.apache.metamodel.data.Style; import org.apache.metamodel.data.StyleBuilder; +import org.apache.metamodel.insert.InsertInto; import org.apache.metamodel.query.Query; import org.apache.metamodel.schema.Column; +import org.apache.metamodel.schema.ColumnType; import org.apache.metamodel.schema.Schema; import org.apache.metamodel.schema.Table; import org.apache.metamodel.schema.naming.CustomColumnNamingStrategy; +import org.apache.metamodel.update.Update; import org.apache.metamodel.util.DateUtils; import org.apache.metamodel.util.FileHelper; import org.apache.metamodel.util.FileResource; import org.apache.metamodel.util.Month; +import org.junit.Test; import junit.framework.TestCase; @@ -792,4 +798,162 @@ public class ExcelDataContextTest extends TestCase { assertNotNull(table.getColumnByName(secondColumnName)); assertNotNull(table.getColumnByName(thirdColumnName)); } + + public void testDetectingDifferentDataTypesInXls() throws Exception { + detectingDataTypes("src/test/resources/different_datatypes.xls"); + } + + public void testDifferentDataTypesInXlsx() throws Exception { + detectingDataTypes("src/test/resources/different_datatypes.xlsx"); + } + + private void detectingDataTypes(final String file) { + final DataContext dataContext = new ExcelDataContext(copyOf(file), new ExcelConfiguration( + ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, false, true, 19)); + + final Schema schema = dataContext.getDefaultSchema(); + assertEquals(2, schema.getTableCount()); + + final Table table = schema.getTables().get(0); + assertEquals("INTEGER", table.getColumns().get(0).getName()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(0).getType()); + assertEquals("TEXT", table.getColumns().get(1).getName()); + assertEquals(ColumnType.STRING, table.getColumns().get(1).getType()); + assertEquals("FORMULA", table.getColumns().get(2).getName()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(2).getType()); + assertEquals("MIXING_DOUBLE_AND_INT", table.getColumns().get(3).getName()); + assertEquals(ColumnType.DOUBLE, table.getColumns().get(3).getType()); + assertEquals("MIXING_OTHER_DATATYPES", table.getColumns().get(4).getName()); + assertEquals(ColumnType.STRING, table.getColumns().get(4).getType()); + final DataSet countDataSet = dataContext.query().from(table).selectCount().execute(); + assertTrue(countDataSet.next()); + assertEquals(20L, countDataSet.getRow().getValue(0)); + assertFalse(countDataSet.next()); + } + + public void testCellValueWithWrongDatatypeIsSetToNull() { + // Unless Integers and Doubles are mixed all other incorrect values will be converted to null with a warning + final DataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/different_datatypes.xls"), + new ExcelConfiguration(ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, false, true, 19)); + final Table table = dataContext.getDefaultSchema().getTables().get(0); + final DataSet testWrongDatatypeDataSet = dataContext + .query() + .from(table) + .select("MIXING_DOUBLE_AND_INT") + .execute(); + IntStream.range(0, 20).forEach(i -> assertTrue(testWrongDatatypeDataSet.next())); + assertNull(testWrongDatatypeDataSet.getRow().getValue(0)); + assertFalse(testWrongDatatypeDataSet.next()); + } + + public void testDetectingDataTypeNotSkippingLinesAndColumnsUsingNameLine() { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/skipped_lines.xlsx"), + new ExcelConfiguration(6, null, false, false, true, 3)); + final Table table = dataContext.getDefaultSchema().getTables().get(0); + assertEquals(ColumnType.STRING, table.getColumns().get(0).getType()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(6).getType()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(7).getType()); + } + + public void testDetectingDataTypeNotSkippingLinesAndColumnsNoNameLine() { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/skipped_lines.xlsx"), + new ExcelConfiguration(ExcelConfiguration.NO_COLUMN_NAME_LINE, null, false, false, true, 3)); + final Table table = dataContext.getDefaultSchema().getTables().get(0); + assertEquals(ColumnType.STRING, table.getColumns().get(0).getType()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(6).getType()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(7).getType()); + } + + public void testDetectingDataTypeSkippingLinesAndColumnsUsingNameLine() { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/skipped_lines.xlsx"), + new ExcelConfiguration(ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, true, true, 3)); + final Table table = dataContext.getDefaultSchema().getTables().get(0); + assertEquals(ColumnType.INTEGER, table.getColumns().get(0).getType()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(1).getType()); + } + + public void testDetectingDataTypeSkippingLinesAndColumnsNoNameLine() { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/skipped_lines.xlsx"), + new ExcelConfiguration(ExcelConfiguration.NO_COLUMN_NAME_LINE, null, true, true, true, 3)); + final Table table = dataContext.getDefaultSchema().getTables().get(0); + assertEquals(ColumnType.INTEGER, table.getColumns().get(0).getType()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(1).getType()); + } + + public void testToMuchNumberOfLinesToScan() throws Exception { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/skipped_lines.xlsx"), + new ExcelConfiguration(ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, true, true, 4)); + + final Table table = dataContext.getDefaultSchema().getTables().get(0); + assertEquals("hello", table.getColumns().get(0).getName()); + assertEquals("world", table.getColumns().get(1).getName()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(0).getType()); + assertEquals(ColumnType.INTEGER, table.getColumns().get(1).getType()); + final DataSet dataSet = dataContext.query().from(table).selectCount().execute(); + assertTrue(dataSet.next()); + assertEquals(3L, dataSet.getRow().getValue(0)); + } + + public void testToMissingValueDoesntEffectDetectedType() throws Exception { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/xls_missing_values.xls"), + new ExcelConfiguration(ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, false, true, 3)); + + final Table table = dataContext.getDefaultSchema().getTables().get(0); + final Column columnB = table.getColumns().get(1); + assertEquals("b", columnB.getName()); + assertEquals(ColumnType.INTEGER, columnB.getType()); + final DataSet dataSetColumnB = dataContext.query().from(table).select(columnB).execute(); + assertTrue(dataSetColumnB.next()); + assertTrue(dataSetColumnB.next()); + assertNull(dataSetColumnB.getRow().getValue(0)); + + final Column columnD = table.getColumns().get(3); + assertEquals("d", columnD.getName()); + assertEquals(ColumnType.INTEGER, columnD.getType()); + final DataSet dataSetColumnD = dataContext + .query() + .from(table) + .select(columnD) + .where(columnD) + .eq(12) + .execute(); + assertTrue(dataSetColumnD.next()); + assertEquals(12, dataSetColumnD.getRow().getValue(0)); + } + + + public void testInsertingValueOfValidColumnType() { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/different_datatypes.xls"), + new ExcelConfiguration(ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, false, true, 19)); + final Table table = dataContext.getDefaultSchema().getTable(0); + dataContext.executeUpdate(new InsertInto(table).value("INTEGER", 123)); + final DataSet dataSet = dataContext.query().from(table).selectAll().where("INTEGER").eq(123).execute(); + assertTrue(dataSet.next()); + } + + @Test(expected = MetaModelException.class) + public void testInsertingValueOfInvalidColumnType() { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/different_datatypes.xls"), + new ExcelConfiguration(ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, false, true, 19)); + final Table table = dataContext.getDefaultSchema().getTable(0); + dataContext.executeUpdate(new InsertInto(table).value("INTEGER", "this is not an integer")); + } + + public void testUpdatingValueOfValidColumnType() { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/different_datatypes.xls"), + new ExcelConfiguration(ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, false, true, 19)); + final Table table = dataContext.getDefaultSchema().getTable(0); + dataContext.executeUpdate(new Update(table).value("INTEGER", 1).value("INTEGER", 123)); + final DataSet dataSet = dataContext.query().from(table).selectAll().where("INTEGER").eq(123).execute(); + assertTrue(dataSet.next()); + } + + @Test(expected = MetaModelException.class) + public void testUpdatingValueOfInvalidColumnType() { + final ExcelDataContext dataContext = new ExcelDataContext(copyOf("src/test/resources/different_datatypes.xls"), + new ExcelConfiguration(ExcelConfiguration.DEFAULT_COLUMN_NAME_LINE, null, true, false, true, 19)); + final Table table = dataContext.getDefaultSchema().getTable(0); + dataContext.executeUpdate(new Update(table).value("INTEGER", 1).value("INTEGER", "this is not an integer")); + } + } \ No newline at end of file diff --git a/excel/src/test/resources/different_datatypes.xls b/excel/src/test/resources/different_datatypes.xls new file mode 100644 index 0000000..1cdb28e Binary files /dev/null and b/excel/src/test/resources/different_datatypes.xls differ diff --git a/excel/src/test/resources/different_datatypes.xlsx b/excel/src/test/resources/different_datatypes.xlsx new file mode 100644 index 0000000..a7be4d0 Binary files /dev/null and b/excel/src/test/resources/different_datatypes.xlsx differ