This is an automated email from the ASF dual-hosted git repository.
exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new 3abb62020d NIFI-14579 Added Row Evaluation Strategy property for Excel
Schema Inference (#10000)
3abb62020d is described below
commit 3abb62020d90de859c3ed6d08e892a34ae5d6c59
Author: dan-s1 <[email protected]>
AuthorDate: Tue Jul 8 23:09:46 2025 -0400
NIFI-14579 Added Row Evaluation Strategy property for Excel Schema
Inference (#10000)
- Aligned the Use Starting Row strategy with the approach used by the
Infer Schema strategy
- Added a Row Evaluation Strategy property, applicable when the starting row
supplies the schema field names, with Standard (10 rows) and All Rows options
Signed-off-by: David Handermann <[email protected]>
---
.../java/org/apache/nifi/excel/ExcelReader.java | 47 ++++--
...y.java => ExcelStartingRowSchemaInference.java} | 97 +++++-------
.../apache/nifi/excel/RowEvaluationStrategy.java | 49 ++++++
...va => TestExcelStartingRowSchemaInference.java} | 164 ++++++++++++---------
4 files changed, 219 insertions(+), 138 deletions(-)
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
index 1bcb601857..05cd317cc0 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
@@ -95,13 +95,24 @@ public class ExcelReader extends SchemaRegistryService
implements RecordReaderFa
.displayName("Starting Row")
.description("The row number of the first row to start processing
(One based)."
+ " Use this to skip over rows of data at the top of a
worksheet that are not part of the dataset."
- + " When using the '" +
ExcelHeaderSchemaStrategy.USE_STARTING_ROW.getValue() + "' strategy this should
be the column header row.")
+ + " When using the '" +
ExcelStartingRowSchemaInference.USE_STARTING_ROW.getValue() + "' strategy this
should be the column header row.")
.required(true)
.defaultValue("1")
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
.build();
+ public static final PropertyDescriptor ROW_EVALUATION_STRATEGY = new
PropertyDescriptor
+ .Builder().name("Row Evaluation Strategy")
+ .displayName("Row Evaluation Strategy")
+ .description("A strategy to select how many rows after the
starting row to use for determining the schema.")
+ .required(true)
+ .allowableValues(RowEvaluationStrategy.class)
+ .defaultValue(RowEvaluationStrategy.STANDARD)
+ .dependsOn(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY,
ExcelStartingRowSchemaInference.USE_STARTING_ROW)
+ .addValidator(StandardValidators.NON_BLANK_VALIDATOR)
+ .build();
+
public static final PropertyDescriptor REQUIRED_SHEETS = new
PropertyDescriptor
.Builder().name("Required Sheets")
.displayName("Required Sheets")
@@ -159,6 +170,7 @@ public class ExcelReader extends SchemaRegistryService
implements RecordReaderFa
properties.add(PROTECTION_TYPE);
properties.add(PASSWORD);
properties.add(STARTING_ROW);
+ properties.add(ROW_EVALUATION_STRATEGY);
properties.add(REQUIRED_SHEETS);
properties.add(DateTimeUtils.DATE_FORMAT);
properties.add(DateTimeUtils.TIME_FORMAT);
@@ -169,12 +181,18 @@ public class ExcelReader extends SchemaRegistryService
implements RecordReaderFa
@Override
protected SchemaAccessStrategy getSchemaAccessStrategy(final String
allowableValue, final SchemaRegistry schemaRegistry, final PropertyContext
context) {
- if
(allowableValue.equalsIgnoreCase(ExcelHeaderSchemaStrategy.USE_STARTING_ROW.getValue()))
{
- return new ExcelHeaderSchemaStrategy(context, getLogger(), new
TimeValueInference(dateFormat, timeFormat, timestampFormat));
+ if
(ExcelStartingRowSchemaInference.USE_STARTING_ROW.getValue().equals(allowableValue))
{
+ final RowEvaluationStrategy rowEvaluationStrategy =
+
context.getProperty(ROW_EVALUATION_STRATEGY).asAllowableValue(RowEvaluationStrategy.class);
+ final int firstRow = context.getProperty(STARTING_ROW)
+ .evaluateAttributeExpressions()
+ .asInteger();
+ final SchemaInferenceEngine<Row> inference =
+ new ExcelStartingRowSchemaInference(rowEvaluationStrategy,
firstRow, createTimeValueInference());
+ return createInferSchemaAccessStrategy(context, inference);
} else if
(SchemaInferenceUtil.INFER_SCHEMA.getValue().equals(allowableValue)) {
- final RecordSourceFactory<Row> sourceFactory = (variables, in) ->
new ExcelRecordSource(in, context, variables, getLogger());
- final SchemaInferenceEngine<Row> inference = new
ExcelSchemaInference(new TimeValueInference(dateFormat, timeFormat,
timestampFormat));
- return new InferSchemaAccessStrategy<>(sourceFactory, inference,
getLogger());
+ final SchemaInferenceEngine<Row> inference = new
ExcelSchemaInference(createTimeValueInference());
+ return createInferSchemaAccessStrategy(context, inference);
}
return super.getSchemaAccessStrategy(allowableValue, schemaRegistry,
context);
@@ -183,21 +201,21 @@ public class ExcelReader extends SchemaRegistryService
implements RecordReaderFa
@Override
protected List<AllowableValue> getSchemaAccessStrategyValues() {
final List<AllowableValue> allowableValues = new
ArrayList<>(super.getSchemaAccessStrategyValues());
- allowableValues.add(ExcelHeaderSchemaStrategy.USE_STARTING_ROW);
+ allowableValues.add(ExcelStartingRowSchemaInference.USE_STARTING_ROW);
allowableValues.add(SchemaInferenceUtil.INFER_SCHEMA);
return allowableValues;
}
@Override
protected AllowableValue getDefaultSchemaAccessStrategy() {
- return ExcelHeaderSchemaStrategy.USE_STARTING_ROW;
+ return ExcelStartingRowSchemaInference.USE_STARTING_ROW;
}
private int getStartingRow(final Map<String, String> variables) {
int rawStartingRow =
configurationContext.getProperty(STARTING_ROW).evaluateAttributeExpressions(variables).asInteger();
- String schemaAccessStrategy =
configurationContext.getProperty(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY).getValue();
+ final String schemaAccessStrategy =
configurationContext.getProperty(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY).getValue();
- if
(ExcelHeaderSchemaStrategy.USE_STARTING_ROW.getValue().equals(schemaAccessStrategy))
{
+ if
(ExcelStartingRowSchemaInference.USE_STARTING_ROW.getValue().equals(schemaAccessStrategy))
{
rawStartingRow++;
}
return getZeroBasedIndex(rawStartingRow);
@@ -222,4 +240,13 @@ public class ExcelReader extends SchemaRegistryService
implements RecordReaderFa
return Collections.emptyList();
}
+
+ private TimeValueInference createTimeValueInference() {
+ return new TimeValueInference(dateFormat, timeFormat, timestampFormat);
+ }
+
+ private InferSchemaAccessStrategy<Row>
createInferSchemaAccessStrategy(final PropertyContext context, final
SchemaInferenceEngine<Row> inference) {
+ final RecordSourceFactory<Row> sourceFactory = (variables, in) -> new
ExcelRecordSource(in, context, variables, getLogger());
+ return new InferSchemaAccessStrategy<>(sourceFactory, inference,
getLogger());
+ }
}
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelStartingRowSchemaInference.java
similarity index 60%
rename from
nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
rename to
nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelStartingRowSchemaInference.java
index edf80710a7..4a6c0f4544 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelStartingRowSchemaInference.java
@@ -16,14 +16,11 @@
*/
package org.apache.nifi.excel;
-import org.apache.commons.lang3.math.NumberUtils;
import org.apache.nifi.components.AllowableValue;
-import org.apache.nifi.context.PropertyContext;
-import org.apache.nifi.logging.ComponentLog;
-import org.apache.nifi.schema.access.SchemaAccessStrategy;
-import org.apache.nifi.schema.access.SchemaField;
import org.apache.nifi.schema.access.SchemaNotFoundException;
import org.apache.nifi.schema.inference.FieldTypeInference;
+import org.apache.nifi.schema.inference.RecordSource;
+import org.apache.nifi.schema.inference.SchemaInferenceEngine;
import org.apache.nifi.schema.inference.TimeValueInference;
import org.apache.nifi.serialization.SimpleRecordSchema;
import org.apache.nifi.serialization.record.RecordField;
@@ -32,89 +29,67 @@ import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
-import java.io.InputStream;
+import java.io.IOException;
import java.util.ArrayList;
-import java.util.EnumSet;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
-public class ExcelHeaderSchemaStrategy implements SchemaAccessStrategy {
- private static final Set<SchemaField> schemaFields =
EnumSet.noneOf(SchemaField.class);
- static final int NUM_ROWS_TO_DETERMINE_TYPES = 10; // NOTE: This number is
arbitrary.
+public class ExcelStartingRowSchemaInference implements
SchemaInferenceEngine<Row> {
+
static final AllowableValue USE_STARTING_ROW = new AllowableValue("Use
Starting Row", "Use Starting Row",
"The configured first row of the Excel file is a header line that
contains the names of the columns. The schema will be derived by using the "
- + "column names in the header of the first sheet and the
following " + NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of
each column "
- + "while the configured header rows of subsequent sheets
are skipped. "
+ + "column names in the header of the first sheet and
dependent on the strategy chosen either the subsequent "
+ + RowEvaluationStrategy.NUM_ROWS_TO_DETERMINE_TYPES + "
rows or all of the subsequent rows. However the configured header rows of
subsequent sheets are skipped. "
+ "NOTE: If there are duplicate column names then each
subsequent duplicate column name is given a one up number. "
- + "For example, column names \"Name\", \"Name\" will be
changed to \"Name\", \"Name_1\"");
-
- private final PropertyContext context;
- private final ComponentLog logger;
+ + "For example, column names \"Name\", \"Name\" will be
changed to \"Name\", \"Name_1\".");
+ private final RowEvaluationStrategy rowEvaluationStrategy;
+ private final int firstRow;
private final CellFieldTypeReader cellFieldTypeReader;
private final DataFormatter dataFormatter;
- public ExcelHeaderSchemaStrategy(PropertyContext context, ComponentLog
logger, TimeValueInference timeValueInference) {
- this.context = context;
- this.logger = logger;
+ public ExcelStartingRowSchemaInference(RowEvaluationStrategy
rowEvaluationStrategy, int firstRow, TimeValueInference timeValueInference) {
+ this.rowEvaluationStrategy = rowEvaluationStrategy;
+ this.firstRow = firstRow;
this.cellFieldTypeReader = new
StandardCellFieldTypeReader(timeValueInference);
this.dataFormatter = new DataFormatter();
}
@Override
- public RecordSchema getSchema(Map<String, String> variables, InputStream
contentStream, RecordSchema readSchema) throws SchemaNotFoundException {
- if (this.context == null) {
- throw new SchemaNotFoundException("Schema Access Strategy intended
only for validation purposes and cannot obtain schema");
- }
-
- final String requiredSheetsDelimited =
context.getProperty(ExcelReader.REQUIRED_SHEETS).evaluateAttributeExpressions(variables).getValue();
- final List<String> requiredSheets =
ExcelReader.getRequiredSheets(requiredSheetsDelimited);
- final Integer rawFirstRow =
context.getProperty(ExcelReader.STARTING_ROW).evaluateAttributeExpressions(variables).asInteger();
- final int firstRow = rawFirstRow == null ?
NumberUtils.toInt(ExcelReader.STARTING_ROW.getDefaultValue()) : rawFirstRow;
- final int zeroBasedFirstRow = ExcelReader.getZeroBasedIndex(firstRow);
- final String password =
context.getProperty(ExcelReader.PASSWORD).getValue();
- final InputFileType inputFileType =
context.getProperty(ExcelReader.INPUT_FILE_TYPE).asAllowableValue(InputFileType.class);
- final ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
- .withRequiredSheets(requiredSheets)
- .withFirstRow(zeroBasedFirstRow)
- .withPassword(password)
- .withInputFileType(inputFileType)
- .build();
-
- final RowIterator rowIterator = new RowIterator(contentStream,
configuration, logger);
+ public RecordSchema inferSchema(RecordSource<Row> recordSource) throws
IOException {
final Map<String, FieldTypeInference> typeMap = new LinkedHashMap<>();
+ final int zeroBasedFirstRow = ExcelReader.getZeroBasedIndex(firstRow);
List<String> fieldNames = null;
int index = 0;
+ Row row;
- while (rowIterator.hasNext()) {
- Row row = rowIterator.next();
+ while ((row = recordSource.next()) != null) {
if (index == 0) {
fieldNames = getFieldNames(firstRow, row);
} else if (row.getRowNum() == zeroBasedFirstRow) { // skip first
row of all sheets
continue;
- } else if (index <= NUM_ROWS_TO_DETERMINE_TYPES) {
- inferSchema(row, fieldNames, typeMap);
} else {
- break;
+ if (RowEvaluationStrategy.STANDARD == rowEvaluationStrategy) {
+ if (index <=
RowEvaluationStrategy.NUM_ROWS_TO_DETERMINE_TYPES) {
+ inferSchema(row, fieldNames, typeMap);
+ } else {
+ break;
+ }
+ } else {
+ inferSchema(row, fieldNames, typeMap);
+ }
}
-
index++;
}
-
- if (typeMap.isEmpty()) {
- final String message = String.format("Failed to infer schema from
empty first %d rows", NUM_ROWS_TO_DETERMINE_TYPES);
- throw new SchemaNotFoundException(message);
- }
return createSchema(typeMap);
}
- private List<String> getFieldNames(int firstRowIndex, Row row) throws
SchemaNotFoundException {
+ private List<String> getFieldNames(int firstRowIndex, Row row) throws
IOException {
if (!ExcelUtils.hasCells(row)) {
- throw new SchemaNotFoundException(String.format("Field names could
not be determined from configured header row %s, as this row has no cells with
data", firstRowIndex));
+ throw new IOException(new
SchemaNotFoundException(String.format("Field names could not be determined from
configured header row %s, as this row has no cells with data", firstRowIndex)));
}
final List<String> fieldNames = new ArrayList<>();
@@ -151,11 +126,12 @@ public class ExcelHeaderSchemaStrategy implements
SchemaAccessStrategy {
return renamedDuplicateFieldNames;
}
- private void inferSchema(final Row row, final List<String> fieldNames,
final Map<String, FieldTypeInference> typeMap) throws SchemaNotFoundException {
+ private void inferSchema(final Row row, final List<String> fieldNames,
final Map<String, FieldTypeInference> typeMap) throws IOException {
// NOTE: This allows rows to be blank when inferring the schema
if (ExcelUtils.hasCells(row)) {
if (row.getLastCellNum() > fieldNames.size()) {
- throw new SchemaNotFoundException(String.format("Row %s has %s
cells, more than the expected %s number of field names", row.getRowNum(),
row.getLastCellNum(), fieldNames.size()));
+ throw new IOException(new
SchemaNotFoundException(String.format("Row %s has %s cells, more than the
expected %s number of field names",
+ row.getRowNum(), row.getLastCellNum(),
fieldNames.size())));
}
IntStream.range(0, row.getLastCellNum())
@@ -167,15 +143,14 @@ public class ExcelHeaderSchemaStrategy implements
SchemaAccessStrategy {
}
}
- private RecordSchema createSchema(final Map<String, FieldTypeInference>
inferences) {
+ private RecordSchema createSchema(final Map<String, FieldTypeInference>
inferences) throws IOException {
+ if (inferences.isEmpty()) {
+ throw new IOException(new SchemaNotFoundException("Failed to infer
schema from empty rows"));
+ }
+
final List<RecordField> recordFields = inferences.entrySet().stream()
.map(entry -> new RecordField(entry.getKey(),
entry.getValue().toDataType(), true))
.collect(Collectors.toList());
return new SimpleRecordSchema(recordFields);
}
-
- @Override
- public Set<SchemaField> getSuppliedSchemaFields() {
- return schemaFields;
- }
}
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowEvaluationStrategy.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowEvaluationStrategy.java
new file mode 100644
index 0000000000..1431a2080b
--- /dev/null
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowEvaluationStrategy.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.excel;
+
+import org.apache.nifi.components.DescribedValue;
+
+public enum RowEvaluationStrategy implements DescribedValue {
+ STANDARD("Standard", "Use %s rows after the starting
row.".formatted(RowEvaluationStrategy.NUM_ROWS_TO_DETERMINE_TYPES)),
+ ALL("All Rows", "Use all the rows after the starting row.");
+
+ static final int NUM_ROWS_TO_DETERMINE_TYPES = 10; // NOTE: This number is
arbitrary.
+
+ private final String displayName;
+ private final String description;
+
+ RowEvaluationStrategy(String displayName, String description) {
+ this.displayName = displayName;
+ this.description = description;
+ }
+
+ @Override
+ public String getValue() {
+ return name();
+ }
+
+ @Override
+ public String getDisplayName() {
+ return displayName;
+ }
+
+ @Override
+ public String getDescription() {
+ return description;
+ }
+}
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelStartingRowSchemaInference.java
similarity index 57%
rename from
nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
rename to
nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelStartingRowSchemaInference.java
index 28fb4dfd8c..00f898a188 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelStartingRowSchemaInference.java
@@ -17,9 +17,10 @@
package org.apache.nifi.excel;
import org.apache.nifi.components.PropertyDescriptor;
-import org.apache.nifi.controller.ConfigurationContext;
+import org.apache.nifi.context.PropertyContext;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.schema.access.SchemaNotFoundException;
+import org.apache.nifi.schema.inference.InferSchemaAccessStrategy;
import org.apache.nifi.schema.inference.TimeValueInference;
import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordFieldType;
@@ -32,8 +33,11 @@ import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
@@ -48,19 +52,22 @@ import java.time.LocalDate;
import java.util.List;
import java.util.Map;
-import static java.nio.file.Files.newDirectoryStream;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
+import static java.nio.file.Files.newDirectoryStream;
import static org.junit.jupiter.api.Assertions.assertTrue;
@ExtendWith(MockitoExtension.class)
-public class TestExcelHeaderSchemaStrategy {
+public class TestExcelStartingRowSchemaInference {
private static final TimeValueInference TIME_VALUE_INFERENCE = new
TimeValueInference("MM/dd/yyyy", "HH:mm:ss.SSS", "yyyy/MM/dd/ HH:mm");
@Mock
- ComponentLog logger;
+ private ComponentLog logger;
+
+ private PropertyContext context;
/*
* Cleanup the temporary poifiles directory which is created by
org.apache.poi.util.DefaultTempFileCreationStrategy
@@ -77,75 +84,74 @@ public class TestExcelHeaderSchemaStrategy {
}
}
- @Test
- void testWhereConfiguredStartRowIsEmpty() throws IOException {
- Object[][] singleSheet = {{}, {1, "Manny"}, {2, "Moe"}, {3, "Jack"}};
- final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ @BeforeEach
+ void setUp() {
final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
+ context = new MockConfigurationContext(properties, null, null);
+ }
+
+ @ParameterizedTest
+ @EnumSource(RowEvaluationStrategy.class)
+ void testWhereConfiguredStartRowIsEmpty(RowEvaluationStrategy
rowEvaluationStrategy) throws IOException {
+ final Object[][] singleSheet = {{}, {1, "Manny"}, {2, "Moe"}, {3,
"Jack"}};
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- SchemaNotFoundException schemaNotFoundException =
assertThrows(SchemaNotFoundException.class, () ->
schemaStrategy.getSchema(null, inputStream, null));
- assertTrue(schemaNotFoundException.getMessage().contains("no cells
with data"));
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+ final IOException ioException = assertThrows(IOException.class, ()
-> inferSchemaAccessStrategy.getSchema(null, inputStream, null));
+ assertInstanceOf(SchemaNotFoundException.class,
ioException.getCause());
+ assertTrue(ioException.getCause().getMessage().contains("Field
names could not be determined from configured header row"));
}
}
- @Test
- void testWhereConfiguredStartRowHasEmptyCell() throws Exception {
- Object[][] singleSheet = {{"ID", "", "Middle"}, {1, "Manny", "M"}, {2,
"Moe", "M"}, {3, "Jack", "J"}};
+ @ParameterizedTest
+ @EnumSource(RowEvaluationStrategy.class)
+ void testWhereConfiguredStartRowHasEmptyCell(RowEvaluationStrategy
rowEvaluationStrategy) throws Exception {
+ final Object[][] singleSheet = {{"ID", "", "Middle"}, {1, "Manny",
"M"}, {2, "Moe", "M"}, {3, "Jack", "J"}};
final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
- final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- RecordSchema schema = schemaStrategy.getSchema(null, inputStream,
null);
- RecordField recordField = schema.getField(1);
- assertEquals("column_1", recordField.getFieldName());
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+ RecordSchema schema = inferSchemaAccessStrategy.getSchema(null,
inputStream, null);
+ assertEquals(List.of("ID", "column_1", "Middle"),
schema.getFieldNames());
}
}
- @Test
- void testWhereInferenceRowHasMoreCellsThanFieldNames() throws Exception {
- Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "Manny",
"M"}, {2, "Moe", "M", "Extra"}, {3, "Jack", "J"}};
+ @ParameterizedTest
+ @EnumSource(RowEvaluationStrategy.class)
+ void testWhereInferenceRowHasMoreCellsThanFieldNames(RowEvaluationStrategy
rowEvaluationStrategy) throws Exception {
+ final Object[][] singleSheet = {{"ID", "First", "Middle"}, {1,
"Manny", "M"}, {2, "Moe", "M", "Extra"}, {3, "Jack", "J"}};
final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
- final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- SchemaNotFoundException schemaNotFoundException =
assertThrows(SchemaNotFoundException.class, () ->
schemaStrategy.getSchema(null, inputStream, null));
- assertTrue(schemaNotFoundException.getMessage().contains("more
than"));
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+ final IOException ioException = assertThrows(IOException.class, ()
-> inferSchemaAccessStrategy.getSchema(null, inputStream, null));
+ assertInstanceOf(SchemaNotFoundException.class,
ioException.getCause());
+ assertTrue(ioException.getCause().getMessage().contains("more
than"));
}
}
@Test
void testWhereTotalRowsLessThanConfiguredInferenceRows() throws Exception {
- Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "Manny",
"M"}, {2, "Moe", "M"}, {3, "Jack", "J"}};
+ final Object[][] singleSheet = {{"ID", "First", "Middle"}, {1,
"Manny", "M"}, {2, "Moe", "M"}, {3, "Jack", "J"}};
final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
- final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- assertDoesNotThrow(() -> schemaStrategy.getSchema(null,
inputStream, null));
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(RowEvaluationStrategy.STANDARD);
+ assertDoesNotThrow(() -> inferSchemaAccessStrategy.getSchema(null,
inputStream, null));
}
}
@Test
- void testWhereConfiguredInferenceRowsHasAnEmptyRow() throws IOException {
- Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "One", "O"},
{2, "Two", "T"}, {3, "Three", "T"},
+ void testWhereConfiguredInferenceRowsHasAnEmptyRow() throws Exception {
+ final Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "One",
"O"}, {2, "Two", "T"}, {3, "Three", "T"},
{4, "Four", "F"}, {5, "Five", "F"}, {}, {7, "Seven", "S"}, {8,
"Eight", "E"},
{9, "Nine", "N"}, {10, "Ten", "T"}};
-
final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
- final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- assertDoesNotThrow(() -> schemaStrategy.getSchema(null,
inputStream, null));
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(RowEvaluationStrategy.STANDARD);
+ assertDoesNotThrow(() -> inferSchemaAccessStrategy.getSchema(null,
inputStream, null));
}
}
@@ -156,31 +162,51 @@ public class TestExcelHeaderSchemaStrategy {
{9, "Nine", "N"}, {10, "Ten", "T"}, {11, "Eleven", "E"}};
final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
- final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- assertDoesNotThrow(() -> schemaStrategy.getSchema(null,
inputStream, null));
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(RowEvaluationStrategy.STANDARD);
+ assertDoesNotThrow(() -> inferSchemaAccessStrategy.getSchema(null,
inputStream, null));
}
}
- @Test
- void testWhereConfiguredInferenceRowsAreAllBlank() throws IOException {
+ @ParameterizedTest
+ @EnumSource(RowEvaluationStrategy.class)
+ void testWhereConfiguredInferenceRowsAreAllBlank(RowEvaluationStrategy
rowEvaluationStrategy) throws Exception {
Object[][] singleSheet = {{"ID", "First", "Middle"}, {}, {}, {}, {},
{}, {}, {}, {}, {}, {}, {11, "Eleven", "E"}};
final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
- final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- SchemaNotFoundException schemaNotFoundException =
assertThrows(SchemaNotFoundException.class, () ->
schemaStrategy.getSchema(null, inputStream, null));
- assertTrue(schemaNotFoundException.getMessage().contains("empty"));
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+
+ switch (rowEvaluationStrategy) {
+ case STANDARD -> {
+ final IOException ioException =
assertThrows(IOException.class, () -> inferSchemaAccessStrategy.getSchema(null,
inputStream, null));
+ assertInstanceOf(SchemaNotFoundException.class,
ioException.getCause());
+
assertTrue(ioException.getCause().getMessage().contains("empty"));
+ }
+ case ALL -> assertDoesNotThrow(() ->
inferSchemaAccessStrategy.getSchema(null, inputStream, null));
+ }
}
}
- @Test
- void testAlignedDateColumnsAcrossTwoSheets() throws Exception {
+ @ParameterizedTest
+ @EnumSource(RowEvaluationStrategy.class)
+ void testWhereRowsAreAllBlank(RowEvaluationStrategy rowEvaluationStrategy)
throws Exception {
+ Object[][] singleSheet = {{"ID", "First", "Middle"}, {}, {}, {}, {},
{}, {}, {}, {}, {}, {}};
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+
+ try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+
+ final IOException ioException = assertThrows(IOException.class, ()
-> inferSchemaAccessStrategy.getSchema(null, inputStream, null));
+ assertInstanceOf(SchemaNotFoundException.class,
ioException.getCause());
+ assertTrue(ioException.getCause().getMessage().contains("empty"));
+ }
+ }
+
+ @ParameterizedTest
+ @EnumSource(RowEvaluationStrategy.class)
+ void testAlignedDateColumnsAcrossTwoSheets(RowEvaluationStrategy
rowEvaluationStrategy) throws Exception {
final String dateColumnName = "Date";
final Object[] columnNames = {dateColumnName, "Something", "Name"};
final Object[][] firstSheet =
@@ -188,12 +214,11 @@ public class TestExcelHeaderSchemaStrategy {
Object[][] secondSheet =
{columnNames, {LocalDate.of(1976, 9, 11), "test1", "Sheet2"},
{LocalDate.of(1987, 2, 12), "test2", "Sheet2"}};
final ByteArrayOutputStream outputStream = createWorkbook(firstSheet,
secondSheet);
- final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- final RecordSchema schema = schemaStrategy.getSchema(null,
inputStream, null);
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+
+ final RecordSchema schema =
inferSchemaAccessStrategy.getSchema(null, inputStream, null);
final RecordField dateRecordField =
schema.getField(dateColumnName).orElse(null);
assertNotNull(dateRecordField);
@@ -202,23 +227,28 @@ public class TestExcelHeaderSchemaStrategy {
}
}
- @Test
- void testDuplicateColumnNames() throws Exception {
+ @ParameterizedTest
+ @EnumSource(RowEvaluationStrategy.class)
+ void testDuplicateColumnNames(RowEvaluationStrategy rowEvaluationStrategy)
throws Exception {
Object[][] singleSheet = {{"Frequency", "Intervals", "Frequency",
"Name", "Frequency", "Intervals"},
{6, "0-9", 13, "John", 15, 2}, {4, "10-19", 15, "Sue", 13, 3}};
final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
- final Map<PropertyDescriptor, String> properties = Map.of();
- final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
- final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
-
try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
- RecordSchema recordSchema = schemaStrategy.getSchema(null,
inputStream, null);
- assertEquals(6, recordSchema.getFieldNames().size());
- assertEquals(List.of("Frequency", "Intervals", "Frequency_1",
"Name", "Frequency_2", "Intervals_1"), recordSchema.getFieldNames());
+ final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy =
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+
+ final RecordSchema schema =
inferSchemaAccessStrategy.getSchema(null, inputStream, null);
+ assertEquals(6, schema.getFieldNames().size());
+ assertEquals(List.of("Frequency", "Intervals", "Frequency_1",
"Name", "Frequency_2", "Intervals_1"), schema.getFieldNames());
}
}
+ private InferSchemaAccessStrategy<?>
getInferSchemaAccessStrategy(RowEvaluationStrategy rowEvaluationStrategy) {
+ return new InferSchemaAccessStrategy<>(
+ (variables, content) -> new ExcelRecordSource(content,
context, variables, logger),
+ new ExcelStartingRowSchemaInference(rowEvaluationStrategy, 1,
TIME_VALUE_INFERENCE), logger);
+ }
+
private static ByteArrayOutputStream createWorkbook(Object[][]...
sheetData) throws IOException {
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();