(nifi) branch main updated: NIFI-14579 Added Row Evaluation Strategy property for Excel Schema Inference (#10000)

exceptionfactory Tue, 08 Jul 2025 20:16:29 -0700

This is an automated email from the ASF dual-hosted git repository.

exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git



The following commit(s) were added to refs/heads/main by this push:
     new 3abb62020d NIFI-14579 Added Row Evaluation Strategy property for Excel Schema Inference (#10000)
3abb62020d is described below

commit 3abb62020d90de859c3ed6d08e892a34ae5d6c59
Author: dan-s1 <[email protected]>
AuthorDate: Tue Jul 8 23:09:46 2025 -0400

    NIFI-14579 Added Row Evaluation Strategy property for Excel Schema Inference (#10000)
    
    - Aligned the Use Starting Row strategy to use a similar approach as the Infer Schema strategy
    - Added a Row Evaluation Strategy property for using the starting row as the schema field names, with either Standard or All Rows options
    
    Signed-off-by: David Handermann <[email protected]>
---
 .../java/org/apache/nifi/excel/ExcelReader.java    |  47 ++++--
 ...y.java => ExcelStartingRowSchemaInference.java} |  97 +++++-------
 .../apache/nifi/excel/RowEvaluationStrategy.java   |  49 ++++++
 ...va => TestExcelStartingRowSchemaInference.java} | 164 ++++++++++++---------
 4 files changed, 219 insertions(+), 138 deletions(-)

diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
index 1bcb601857..05cd317cc0 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
@@ -95,13 +95,24 @@ public class ExcelReader extends SchemaRegistryService 
implements RecordReaderFa
             .displayName("Starting Row")
             .description("The row number of the first row to start processing 
(One based)."
                     + " Use this to skip over rows of data at the top of a 
worksheet that are not part of the dataset."
-                    + " When using the '" + 
ExcelHeaderSchemaStrategy.USE_STARTING_ROW.getValue() + "' strategy this should 
be the column header row.")
+                    + " When using the '" + 
ExcelStartingRowSchemaInference.USE_STARTING_ROW.getValue() + "' strategy this 
should be the column header row.")
             .required(true)
             .defaultValue("1")
             
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
             .addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
             .build();
 
+    public static final PropertyDescriptor ROW_EVALUATION_STRATEGY = new 
PropertyDescriptor
+            .Builder().name("Row Evaluation Strategy")
+            .displayName("Row Evaluation Strategy")
+            .description("A strategy to select how many rows after the 
starting row to use for determining the schema.")
+            .required(true)
+            .allowableValues(RowEvaluationStrategy.class)
+            .defaultValue(RowEvaluationStrategy.STANDARD)
+            .dependsOn(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, 
ExcelStartingRowSchemaInference.USE_STARTING_ROW)
+            .addValidator(StandardValidators.NON_BLANK_VALIDATOR)
+            .build();
+
     public static final PropertyDescriptor REQUIRED_SHEETS = new 
PropertyDescriptor
             .Builder().name("Required Sheets")
             .displayName("Required Sheets")
@@ -159,6 +170,7 @@ public class ExcelReader extends SchemaRegistryService 
implements RecordReaderFa
         properties.add(PROTECTION_TYPE);
         properties.add(PASSWORD);
         properties.add(STARTING_ROW);
+        properties.add(ROW_EVALUATION_STRATEGY);
         properties.add(REQUIRED_SHEETS);
         properties.add(DateTimeUtils.DATE_FORMAT);
         properties.add(DateTimeUtils.TIME_FORMAT);
@@ -169,12 +181,18 @@ public class ExcelReader extends SchemaRegistryService 
implements RecordReaderFa
 
     @Override
     protected SchemaAccessStrategy getSchemaAccessStrategy(final String 
allowableValue, final SchemaRegistry schemaRegistry, final PropertyContext 
context) {
-        if 
(allowableValue.equalsIgnoreCase(ExcelHeaderSchemaStrategy.USE_STARTING_ROW.getValue()))
 {
-            return new ExcelHeaderSchemaStrategy(context, getLogger(), new 
TimeValueInference(dateFormat, timeFormat, timestampFormat));
+        if 
(ExcelStartingRowSchemaInference.USE_STARTING_ROW.getValue().equals(allowableValue))
 {
+            final RowEvaluationStrategy rowEvaluationStrategy =
+                    
context.getProperty(ROW_EVALUATION_STRATEGY).asAllowableValue(RowEvaluationStrategy.class);
+            final int firstRow = context.getProperty(STARTING_ROW)
+                    .evaluateAttributeExpressions()
+                    .asInteger();
+            final SchemaInferenceEngine<Row> inference =
+                    new ExcelStartingRowSchemaInference(rowEvaluationStrategy, 
firstRow, createTimeValueInference());
+            return createInferSchemaAccessStrategy(context, inference);
         } else if 
(SchemaInferenceUtil.INFER_SCHEMA.getValue().equals(allowableValue)) {
-            final RecordSourceFactory<Row> sourceFactory = (variables, in) -> 
new ExcelRecordSource(in, context, variables, getLogger());
-            final SchemaInferenceEngine<Row> inference = new 
ExcelSchemaInference(new TimeValueInference(dateFormat, timeFormat, 
timestampFormat));
-            return new InferSchemaAccessStrategy<>(sourceFactory, inference, 
getLogger());
+            final SchemaInferenceEngine<Row> inference = new 
ExcelSchemaInference(createTimeValueInference());
+            return createInferSchemaAccessStrategy(context, inference);
         }
 
         return super.getSchemaAccessStrategy(allowableValue, schemaRegistry, 
context);
@@ -183,21 +201,21 @@ public class ExcelReader extends SchemaRegistryService 
implements RecordReaderFa
     @Override
     protected List<AllowableValue> getSchemaAccessStrategyValues() {
         final List<AllowableValue> allowableValues = new 
ArrayList<>(super.getSchemaAccessStrategyValues());
-        allowableValues.add(ExcelHeaderSchemaStrategy.USE_STARTING_ROW);
+        allowableValues.add(ExcelStartingRowSchemaInference.USE_STARTING_ROW);
         allowableValues.add(SchemaInferenceUtil.INFER_SCHEMA);
         return allowableValues;
     }
 
     @Override
     protected AllowableValue getDefaultSchemaAccessStrategy() {
-        return ExcelHeaderSchemaStrategy.USE_STARTING_ROW;
+        return ExcelStartingRowSchemaInference.USE_STARTING_ROW;
     }
 
     private int getStartingRow(final Map<String, String> variables) {
         int rawStartingRow = 
configurationContext.getProperty(STARTING_ROW).evaluateAttributeExpressions(variables).asInteger();
-        String schemaAccessStrategy = 
configurationContext.getProperty(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY).getValue();
+        final String schemaAccessStrategy = 
configurationContext.getProperty(SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY).getValue();
 
-        if 
(ExcelHeaderSchemaStrategy.USE_STARTING_ROW.getValue().equals(schemaAccessStrategy))
 {
+        if 
(ExcelStartingRowSchemaInference.USE_STARTING_ROW.getValue().equals(schemaAccessStrategy))
 {
             rawStartingRow++;
         }
         return getZeroBasedIndex(rawStartingRow);
@@ -222,4 +240,13 @@ public class ExcelReader extends SchemaRegistryService 
implements RecordReaderFa
 
         return Collections.emptyList();
     }
+
+    private TimeValueInference createTimeValueInference() {
+        return new TimeValueInference(dateFormat, timeFormat, timestampFormat);
+    }
+
+    private InferSchemaAccessStrategy<Row> 
createInferSchemaAccessStrategy(final PropertyContext context, final 
SchemaInferenceEngine<Row> inference) {
+        final RecordSourceFactory<Row> sourceFactory = (variables, in) -> new 
ExcelRecordSource(in, context, variables, getLogger());
+        return new InferSchemaAccessStrategy<>(sourceFactory, inference, 
getLogger());
+    }
 }
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelStartingRowSchemaInference.java
similarity index 60%
rename from 
nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
rename to 
nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelStartingRowSchemaInference.java
index edf80710a7..4a6c0f4544 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelStartingRowSchemaInference.java
@@ -16,14 +16,11 @@
  */
 package org.apache.nifi.excel;
 
-import org.apache.commons.lang3.math.NumberUtils;
 import org.apache.nifi.components.AllowableValue;
-import org.apache.nifi.context.PropertyContext;
-import org.apache.nifi.logging.ComponentLog;
-import org.apache.nifi.schema.access.SchemaAccessStrategy;
-import org.apache.nifi.schema.access.SchemaField;
 import org.apache.nifi.schema.access.SchemaNotFoundException;
 import org.apache.nifi.schema.inference.FieldTypeInference;
+import org.apache.nifi.schema.inference.RecordSource;
+import org.apache.nifi.schema.inference.SchemaInferenceEngine;
 import org.apache.nifi.schema.inference.TimeValueInference;
 import org.apache.nifi.serialization.SimpleRecordSchema;
 import org.apache.nifi.serialization.record.RecordField;
@@ -32,89 +29,67 @@ import org.apache.poi.ss.usermodel.Cell;
 import org.apache.poi.ss.usermodel.DataFormatter;
 import org.apache.poi.ss.usermodel.Row;
 
-import java.io.InputStream;
+import java.io.IOException;
 import java.util.ArrayList;
-import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
-public class ExcelHeaderSchemaStrategy implements SchemaAccessStrategy {
-    private static final Set<SchemaField> schemaFields = 
EnumSet.noneOf(SchemaField.class);
-    static final int NUM_ROWS_TO_DETERMINE_TYPES = 10; // NOTE: This number is 
arbitrary.
+public class ExcelStartingRowSchemaInference implements 
SchemaInferenceEngine<Row> {
+
     static final AllowableValue USE_STARTING_ROW = new AllowableValue("Use 
Starting Row", "Use Starting Row",
             "The configured first row of the Excel file is a header line that 
contains the names of the columns. The schema will be derived by using the "
-                    + "column names in the header of the first sheet and the 
following " + NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of 
each column "
-                    + "while the configured header rows of subsequent sheets 
are skipped. "
+                    + "column names in the header of the first sheet and 
dependent on the strategy chosen either the subsequent "
+                    + RowEvaluationStrategy.NUM_ROWS_TO_DETERMINE_TYPES + " 
rows or all of the subsequent rows. However the configured header rows of 
subsequent sheets are skipped. "
                     + "NOTE: If there are duplicate column names then each 
subsequent duplicate column name is given a one up number. "
-                    + "For example, column names \"Name\", \"Name\" will be 
changed to \"Name\", \"Name_1\"");
-
-    private final PropertyContext context;
-    private final ComponentLog logger;
+                    + "For example, column names \"Name\", \"Name\" will be 
changed to \"Name\", \"Name_1\".");
+    private final RowEvaluationStrategy rowEvaluationStrategy;
+    private final int firstRow;
     private final CellFieldTypeReader cellFieldTypeReader;
     private final DataFormatter dataFormatter;
 
-    public ExcelHeaderSchemaStrategy(PropertyContext context, ComponentLog 
logger, TimeValueInference timeValueInference) {
-        this.context = context;
-        this.logger = logger;
+    public ExcelStartingRowSchemaInference(RowEvaluationStrategy 
rowEvaluationStrategy, int firstRow, TimeValueInference timeValueInference) {
+        this.rowEvaluationStrategy = rowEvaluationStrategy;
+        this.firstRow = firstRow;
         this.cellFieldTypeReader = new 
StandardCellFieldTypeReader(timeValueInference);
         this.dataFormatter = new DataFormatter();
     }
 
     @Override
-    public RecordSchema getSchema(Map<String, String> variables, InputStream 
contentStream, RecordSchema readSchema) throws SchemaNotFoundException {
-        if (this.context == null) {
-            throw new SchemaNotFoundException("Schema Access Strategy intended 
only for validation purposes and cannot obtain schema");
-        }
-
-        final String requiredSheetsDelimited = 
context.getProperty(ExcelReader.REQUIRED_SHEETS).evaluateAttributeExpressions(variables).getValue();
-        final List<String> requiredSheets = 
ExcelReader.getRequiredSheets(requiredSheetsDelimited);
-        final Integer rawFirstRow = 
context.getProperty(ExcelReader.STARTING_ROW).evaluateAttributeExpressions(variables).asInteger();
-        final int firstRow = rawFirstRow == null ? 
NumberUtils.toInt(ExcelReader.STARTING_ROW.getDefaultValue()) : rawFirstRow;
-        final int zeroBasedFirstRow = ExcelReader.getZeroBasedIndex(firstRow);
-        final String password = 
context.getProperty(ExcelReader.PASSWORD).getValue();
-        final InputFileType inputFileType = 
context.getProperty(ExcelReader.INPUT_FILE_TYPE).asAllowableValue(InputFileType.class);
-        final ExcelRecordReaderConfiguration configuration = new 
ExcelRecordReaderConfiguration.Builder()
-                .withRequiredSheets(requiredSheets)
-                .withFirstRow(zeroBasedFirstRow)
-                .withPassword(password)
-                .withInputFileType(inputFileType)
-                .build();
-
-        final RowIterator rowIterator = new RowIterator(contentStream, 
configuration, logger);
+    public RecordSchema inferSchema(RecordSource<Row> recordSource) throws 
IOException {
         final Map<String, FieldTypeInference> typeMap = new LinkedHashMap<>();
+        final int zeroBasedFirstRow = ExcelReader.getZeroBasedIndex(firstRow);
         List<String> fieldNames = null;
         int index = 0;
+        Row row;
 
-        while (rowIterator.hasNext()) {
-            Row row = rowIterator.next();
+        while ((row = recordSource.next()) != null) {
             if (index == 0) {
                 fieldNames = getFieldNames(firstRow, row);
             } else if (row.getRowNum() == zeroBasedFirstRow) { // skip first 
row of all sheets
                 continue;
-            } else if (index <= NUM_ROWS_TO_DETERMINE_TYPES) {
-                inferSchema(row, fieldNames, typeMap);
             } else {
-                break;
+                if (RowEvaluationStrategy.STANDARD == rowEvaluationStrategy) {
+                    if (index <= 
RowEvaluationStrategy.NUM_ROWS_TO_DETERMINE_TYPES) {
+                        inferSchema(row, fieldNames, typeMap);
+                    } else {
+                        break;
+                    }
+                } else {
+                    inferSchema(row, fieldNames, typeMap);
+                }
             }
-
             index++;
         }
-
-        if (typeMap.isEmpty()) {
-            final String message = String.format("Failed to infer schema from 
empty first %d rows", NUM_ROWS_TO_DETERMINE_TYPES);
-            throw new SchemaNotFoundException(message);
-        }
         return createSchema(typeMap);
     }
 
-    private List<String> getFieldNames(int firstRowIndex, Row row) throws 
SchemaNotFoundException {
+    private List<String> getFieldNames(int firstRowIndex, Row row) throws 
IOException {
         if (!ExcelUtils.hasCells(row)) {
-            throw new SchemaNotFoundException(String.format("Field names could 
not be determined from configured header row %s, as this row has no cells with 
data", firstRowIndex));
+            throw new IOException(new 
SchemaNotFoundException(String.format("Field names could not be determined from 
configured header row %s, as this row has no cells with data", firstRowIndex)));
         }
 
         final List<String> fieldNames = new ArrayList<>();
@@ -151,11 +126,12 @@ public class ExcelHeaderSchemaStrategy implements 
SchemaAccessStrategy {
         return renamedDuplicateFieldNames;
     }
 
-    private void inferSchema(final Row row, final List<String> fieldNames, 
final Map<String, FieldTypeInference> typeMap) throws SchemaNotFoundException {
+    private void inferSchema(final Row row, final List<String> fieldNames, 
final Map<String, FieldTypeInference> typeMap) throws IOException {
         // NOTE: This allows rows to be blank when inferring the schema
         if (ExcelUtils.hasCells(row)) {
             if (row.getLastCellNum() > fieldNames.size()) {
-                throw new SchemaNotFoundException(String.format("Row %s has %s 
cells, more than the expected %s number of field names", row.getRowNum(), 
row.getLastCellNum(), fieldNames.size()));
+                throw new IOException(new 
SchemaNotFoundException(String.format("Row %s has %s cells, more than the 
expected %s number of field names",
+                        row.getRowNum(), row.getLastCellNum(), 
fieldNames.size())));
             }
 
             IntStream.range(0, row.getLastCellNum())
@@ -167,15 +143,14 @@ public class ExcelHeaderSchemaStrategy implements 
SchemaAccessStrategy {
         }
     }
 
-    private RecordSchema createSchema(final Map<String, FieldTypeInference> 
inferences) {
+    private RecordSchema createSchema(final Map<String, FieldTypeInference> 
inferences) throws IOException {
+        if (inferences.isEmpty()) {
+            throw new IOException(new SchemaNotFoundException("Failed to infer 
schema from empty rows"));
+        }
+
         final List<RecordField> recordFields = inferences.entrySet().stream()
                 .map(entry -> new RecordField(entry.getKey(), 
entry.getValue().toDataType(), true))
                 .collect(Collectors.toList());
         return new SimpleRecordSchema(recordFields);
     }
-
-    @Override
-    public Set<SchemaField> getSuppliedSchemaFields() {
-        return schemaFields;
-    }
 }
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowEvaluationStrategy.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowEvaluationStrategy.java
new file mode 100644
index 0000000000..1431a2080b
--- /dev/null
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowEvaluationStrategy.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.excel;
+
+import org.apache.nifi.components.DescribedValue;
+
+public enum RowEvaluationStrategy implements DescribedValue {
+    STANDARD("Standard", "Use %s rows after the starting 
row.".formatted(RowEvaluationStrategy.NUM_ROWS_TO_DETERMINE_TYPES)),
+    ALL("All Rows", "Use all the rows after the starting row.");
+
+    static final int NUM_ROWS_TO_DETERMINE_TYPES = 10; // NOTE: This number is 
arbitrary.
+
+    private final String displayName;
+    private final String description;
+
+    RowEvaluationStrategy(String displayName, String description) {
+        this.displayName = displayName;
+        this.description = description;
+    }
+
+    @Override
+    public String getValue() {
+        return name();
+    }
+
+    @Override
+    public String getDisplayName() {
+        return displayName;
+    }
+
+    @Override
+    public String getDescription() {
+        return description;
+    }
+}
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelStartingRowSchemaInference.java
similarity index 57%
rename from 
nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
rename to 
nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelStartingRowSchemaInference.java
index 28fb4dfd8c..00f898a188 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelStartingRowSchemaInference.java
@@ -17,9 +17,10 @@
 package org.apache.nifi.excel;
 
 import org.apache.nifi.components.PropertyDescriptor;
-import org.apache.nifi.controller.ConfigurationContext;
+import org.apache.nifi.context.PropertyContext;
 import org.apache.nifi.logging.ComponentLog;
 import org.apache.nifi.schema.access.SchemaNotFoundException;
+import org.apache.nifi.schema.inference.InferSchemaAccessStrategy;
 import org.apache.nifi.schema.inference.TimeValueInference;
 import org.apache.nifi.serialization.record.RecordField;
 import org.apache.nifi.serialization.record.RecordFieldType;
@@ -32,8 +33,11 @@ import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.ExtendWith;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;
 
@@ -48,19 +52,22 @@ import java.time.LocalDate;
 import java.util.List;
 import java.util.Map;
 
-import static java.nio.file.Files.newDirectoryStream;
 import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
+import static java.nio.file.Files.newDirectoryStream;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 @ExtendWith(MockitoExtension.class)
-public class TestExcelHeaderSchemaStrategy {
+public class TestExcelStartingRowSchemaInference {
     private static final TimeValueInference TIME_VALUE_INFERENCE = new 
TimeValueInference("MM/dd/yyyy", "HH:mm:ss.SSS", "yyyy/MM/dd/ HH:mm");
 
     @Mock
-    ComponentLog logger;
+    private ComponentLog logger;
+
+    private PropertyContext context;
 
     /*
      * Cleanup the temporary poifiles directory which is created by 
org.apache.poi.util.DefaultTempFileCreationStrategy
@@ -77,75 +84,74 @@ public class TestExcelHeaderSchemaStrategy {
         }
     }
 
-    @Test
-    void testWhereConfiguredStartRowIsEmpty() throws IOException {
-        Object[][] singleSheet = {{}, {1, "Manny"}, {2, "Moe"}, {3, "Jack"}};
-        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+    @BeforeEach
+    void setUp() {
         final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
+        context = new MockConfigurationContext(properties, null, null);
+    }
+
+    @ParameterizedTest
+    @EnumSource(RowEvaluationStrategy.class)
+    void testWhereConfiguredStartRowIsEmpty(RowEvaluationStrategy 
rowEvaluationStrategy) throws IOException {
+        final Object[][] singleSheet = {{}, {1, "Manny"}, {2, "Moe"}, {3, 
"Jack"}};
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
 
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            SchemaNotFoundException schemaNotFoundException = 
assertThrows(SchemaNotFoundException.class, () -> 
schemaStrategy.getSchema(null, inputStream, null));
-            assertTrue(schemaNotFoundException.getMessage().contains("no cells 
with data"));
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+            final IOException ioException = assertThrows(IOException.class, () 
-> inferSchemaAccessStrategy.getSchema(null, inputStream, null));
+            assertInstanceOf(SchemaNotFoundException.class, 
ioException.getCause());
+            assertTrue(ioException.getCause().getMessage().contains("Field 
names could not be determined from configured header row"));
         }
     }
 
-    @Test
-    void testWhereConfiguredStartRowHasEmptyCell() throws Exception {
-        Object[][] singleSheet = {{"ID", "", "Middle"}, {1, "Manny", "M"}, {2, 
"Moe", "M"}, {3, "Jack", "J"}};
+    @ParameterizedTest
+    @EnumSource(RowEvaluationStrategy.class)
+    void testWhereConfiguredStartRowHasEmptyCell(RowEvaluationStrategy 
rowEvaluationStrategy) throws Exception {
+        final Object[][] singleSheet = {{"ID", "", "Middle"}, {1, "Manny", 
"M"}, {2, "Moe", "M"}, {3, "Jack", "J"}};
         final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
-        final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            RecordSchema schema = schemaStrategy.getSchema(null, inputStream, 
null);
-            RecordField recordField = schema.getField(1);
-            assertEquals("column_1", recordField.getFieldName());
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+            RecordSchema schema = inferSchemaAccessStrategy.getSchema(null, 
inputStream, null);
+            assertEquals(List.of("ID", "column_1", "Middle"), 
schema.getFieldNames());
         }
     }
 
-    @Test
-    void testWhereInferenceRowHasMoreCellsThanFieldNames() throws Exception {
-        Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "Manny", 
"M"}, {2, "Moe", "M", "Extra"}, {3, "Jack", "J"}};
+    @ParameterizedTest
+    @EnumSource(RowEvaluationStrategy.class)
+    void testWhereInferenceRowHasMoreCellsThanFieldNames(RowEvaluationStrategy 
rowEvaluationStrategy) throws Exception {
+        final Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, 
"Manny", "M"}, {2, "Moe", "M", "Extra"}, {3, "Jack", "J"}};
         final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
-        final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            SchemaNotFoundException schemaNotFoundException = 
assertThrows(SchemaNotFoundException.class, () -> 
schemaStrategy.getSchema(null, inputStream, null));
-            assertTrue(schemaNotFoundException.getMessage().contains("more 
than"));
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+            final IOException ioException = assertThrows(IOException.class, () 
-> inferSchemaAccessStrategy.getSchema(null, inputStream, null));
+            assertInstanceOf(SchemaNotFoundException.class, 
ioException.getCause());
+            assertTrue(ioException.getCause().getMessage().contains("more 
than"));
         }
     }
 
     @Test
     void testWhereTotalRowsLessThanConfiguredInferenceRows() throws Exception {
-        Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "Manny", 
"M"}, {2, "Moe", "M"}, {3, "Jack", "J"}};
+        final Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, 
"Manny", "M"}, {2, "Moe", "M"}, {3, "Jack", "J"}};
         final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
-        final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            assertDoesNotThrow(() -> schemaStrategy.getSchema(null, 
inputStream, null));
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(RowEvaluationStrategy.STANDARD);
+            assertDoesNotThrow(() -> inferSchemaAccessStrategy.getSchema(null, 
inputStream, null));
         }
     }
 
     @Test
-    void testWhereConfiguredInferenceRowsHasAnEmptyRow() throws IOException {
-        Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "One", "O"}, 
{2, "Two", "T"}, {3, "Three", "T"},
+    void testWhereConfiguredInferenceRowsHasAnEmptyRow() throws Exception {
+        final Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "One", 
"O"}, {2, "Two", "T"}, {3, "Three", "T"},
                 {4, "Four", "F"}, {5, "Five", "F"}, {}, {7, "Seven", "S"}, {8, 
"Eight", "E"},
                 {9, "Nine", "N"}, {10, "Ten", "T"}};
-
         final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
-        final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            assertDoesNotThrow(() -> schemaStrategy.getSchema(null, 
inputStream, null));
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(RowEvaluationStrategy.STANDARD);
+            assertDoesNotThrow(() -> inferSchemaAccessStrategy.getSchema(null, 
inputStream, null));
         }
     }
 
@@ -156,31 +162,51 @@ public class TestExcelHeaderSchemaStrategy {
                 {9, "Nine", "N"}, {10, "Ten", "T"}, {11, "Eleven", "E"}};
 
         final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
-        final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            assertDoesNotThrow(() -> schemaStrategy.getSchema(null, 
inputStream, null));
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(RowEvaluationStrategy.STANDARD);
+            assertDoesNotThrow(() -> inferSchemaAccessStrategy.getSchema(null, 
inputStream, null));
         }
     }
 
-    @Test
-    void testWhereConfiguredInferenceRowsAreAllBlank() throws IOException {
+    @ParameterizedTest
+    @EnumSource(RowEvaluationStrategy.class)
+    void testWhereConfiguredInferenceRowsAreAllBlank(RowEvaluationStrategy 
rowEvaluationStrategy) throws Exception {
         Object[][] singleSheet = {{"ID", "First", "Middle"}, {}, {}, {}, {}, 
{}, {}, {}, {}, {}, {}, {11, "Eleven", "E"}};
         final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
-        final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            SchemaNotFoundException schemaNotFoundException = 
assertThrows(SchemaNotFoundException.class, () -> 
schemaStrategy.getSchema(null, inputStream, null));
-            assertTrue(schemaNotFoundException.getMessage().contains("empty"));
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+
+            switch (rowEvaluationStrategy) {
+                case STANDARD -> {
+                    final IOException ioException = 
assertThrows(IOException.class, () -> inferSchemaAccessStrategy.getSchema(null, 
inputStream, null));
+                    assertInstanceOf(SchemaNotFoundException.class, 
ioException.getCause());
+                    
assertTrue(ioException.getCause().getMessage().contains("empty"));
+                }
+                case ALL -> assertDoesNotThrow(() -> 
inferSchemaAccessStrategy.getSchema(null, inputStream, null));
+            }
         }
     }
 
-    @Test
-    void testAlignedDateColumnsAcrossTwoSheets() throws Exception {
+    @ParameterizedTest
+    @EnumSource(RowEvaluationStrategy.class)
+    void testWhereRowsAreAllBlank(RowEvaluationStrategy rowEvaluationStrategy) 
throws Exception {
+        Object[][] singleSheet = {{"ID", "First", "Middle"}, {}, {}, {}, {}, 
{}, {}, {}, {}, {}, {}};
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+
+        try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+
+            final IOException ioException = assertThrows(IOException.class, () 
-> inferSchemaAccessStrategy.getSchema(null, inputStream, null));
+            assertInstanceOf(SchemaNotFoundException.class, 
ioException.getCause());
+            assertTrue(ioException.getCause().getMessage().contains("empty"));
+        }
+    }
+
+    @ParameterizedTest
+    @EnumSource(RowEvaluationStrategy.class)
+    void testAlignedDateColumnsAcrossTwoSheets(RowEvaluationStrategy 
rowEvaluationStrategy) throws Exception {
         final String dateColumnName = "Date";
         final Object[] columnNames = {dateColumnName, "Something", "Name"};
         final Object[][] firstSheet =
@@ -188,12 +214,11 @@ public class TestExcelHeaderSchemaStrategy {
         Object[][] secondSheet =
                 {columnNames, {LocalDate.of(1976, 9, 11), "test1", "Sheet2"}, 
{LocalDate.of(1987, 2, 12), "test2", "Sheet2"}};
         final ByteArrayOutputStream outputStream = createWorkbook(firstSheet, 
secondSheet);
-        final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            final RecordSchema schema = schemaStrategy.getSchema(null, 
inputStream, null);
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+
+            final RecordSchema schema = 
inferSchemaAccessStrategy.getSchema(null, inputStream, null);
             final RecordField dateRecordField = 
schema.getField(dateColumnName).orElse(null);
 
             assertNotNull(dateRecordField);
@@ -202,23 +227,28 @@ public class TestExcelHeaderSchemaStrategy {
         }
     }
 
-    @Test
-    void testDuplicateColumnNames() throws Exception {
+    @ParameterizedTest
+    @EnumSource(RowEvaluationStrategy.class)
+    void testDuplicateColumnNames(RowEvaluationStrategy rowEvaluationStrategy) 
throws Exception {
         Object[][] singleSheet = {{"Frequency", "Intervals", "Frequency", 
"Name", "Frequency", "Intervals"},
                 {6, "0-9", 13, "John", 15, 2}, {4, "10-19", 15, "Sue", 13, 3}};
 
         final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
-        final Map<PropertyDescriptor, String> properties = Map.of();
-        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
-        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
-
         try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
-            RecordSchema recordSchema = schemaStrategy.getSchema(null, 
inputStream, null);
-            assertEquals(6, recordSchema.getFieldNames().size());
-            assertEquals(List.of("Frequency", "Intervals", "Frequency_1", 
"Name", "Frequency_2", "Intervals_1"), recordSchema.getFieldNames());
+            final InferSchemaAccessStrategy<?> inferSchemaAccessStrategy = 
getInferSchemaAccessStrategy(rowEvaluationStrategy);
+
+            final RecordSchema schema = 
inferSchemaAccessStrategy.getSchema(null, inputStream, null);
+            assertEquals(6, schema.getFieldNames().size());
+            assertEquals(List.of("Frequency", "Intervals", "Frequency_1", 
"Name", "Frequency_2", "Intervals_1"), schema.getFieldNames());
         }
     }
 
+    private InferSchemaAccessStrategy<?> 
getInferSchemaAccessStrategy(RowEvaluationStrategy rowEvaluationStrategy) {
+        return new InferSchemaAccessStrategy<>(
+                (variables, content) -> new ExcelRecordSource(content, 
context, variables, logger),
+                new ExcelStartingRowSchemaInference(rowEvaluationStrategy, 1, 
TIME_VALUE_INFERENCE), logger);
+    }
+
     private static ByteArrayOutputStream createWorkbook(Object[][]... 
sheetData) throws IOException {
         final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

(nifi) branch main updated: NIFI-14579 Added Row Evaluation Strategy property for Excel Schema Inference (#10000)

Reply via email to