This is an automated email from the ASF dual-hosted git repository.
exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new 8f2657bdb0 NIFI-14596 Updated ExcelHeaderSchemaStrategy to append a
number on duplicate columns (#9975)
8f2657bdb0 is described below
commit 8f2657bdb00bc5664f328b82c01a96b7d18f44e1
Author: dan-s1 <[email protected]>
AuthorDate: Sat Jun 7 16:39:15 2025 -0400
NIFI-14596 Updated ExcelHeaderSchemaStrategy to append a number on
duplicate columns (#9975)
- Revised implementation avoids losing data when reading Excel files with
duplicate column names using the ExcelHeaderSchemaStrategy
Signed-off-by: David Handermann <[email protected]>
---
.../nifi/excel/ExcelHeaderSchemaStrategy.java | 27 +++++++++++++++++++---
.../nifi/excel/TestExcelHeaderSchemaStrategy.java | 18 +++++++++++++++
2 files changed, 42 insertions(+), 3 deletions(-)
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
index fc8073adf7..edf80710a7 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
@@ -35,6 +35,7 @@ import org.apache.poi.ss.usermodel.Row;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.EnumSet;
+import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -47,8 +48,10 @@ public class ExcelHeaderSchemaStrategy implements
SchemaAccessStrategy {
static final int NUM_ROWS_TO_DETERMINE_TYPES = 10; // NOTE: This number is
arbitrary.
static final AllowableValue USE_STARTING_ROW = new AllowableValue("Use
Starting Row", "Use Starting Row",
"The configured first row of the Excel file is a header line that
contains the names of the columns. The schema will be derived by using the "
- + "column names in the header of the first sheet and the
following " + NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of
each column " +
- "while the configured header rows of subsequent sheets
are skipped.");
+ + "column names in the header of the first sheet and the
following " + NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of
each column "
+ + "while the configured header rows of subsequent sheets
are skipped. "
+ + "NOTE: If there are duplicate column names then each
subsequent duplicate column name is given a one up number. "
+ + "For example, column names \"Name\", \"Name\" will be
changed to \"Name\", \"Name_1\"");
private final PropertyContext context;
private final ComponentLog logger;
@@ -126,8 +129,26 @@ public class ExcelHeaderSchemaStrategy implements
SchemaAccessStrategy {
fieldNames.add(fieldName);
}
}
+ final List<String> renamedDuplicateFieldNames =
renameDuplicateFieldNames(fieldNames);
- return fieldNames;
+ return renamedDuplicateFieldNames;
+ }
+
+ private List<String> renameDuplicateFieldNames(final List<String>
fieldNames) {
+ final Map<String, Integer> fieldNameCounts = new HashMap<>();
+ final List<String> renamedDuplicateFieldNames = new ArrayList<>();
+
+ for (String fieldName : fieldNames) {
+ if (fieldNameCounts.containsKey(fieldName)) {
+ final int count = fieldNameCounts.get(fieldName);
+ renamedDuplicateFieldNames.add("%s_%d".formatted(fieldName,
count));
+ fieldNameCounts.put(fieldName, count + 1);
+ } else {
+ fieldNameCounts.put(fieldName, 1);
+ renamedDuplicateFieldNames.add(fieldName);
+ }
+ }
+ return renamedDuplicateFieldNames;
}
private void inferSchema(final Row row, final List<String> fieldNames,
final Map<String, FieldTypeInference> typeMap) throws SchemaNotFoundException {
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
index f431bbc4be..28fb4dfd8c 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
@@ -45,6 +45,7 @@ import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDate;
+import java.util.List;
import java.util.Map;
import static java.nio.file.Files.newDirectoryStream;
@@ -201,6 +202,23 @@ public class TestExcelHeaderSchemaStrategy {
}
}
+ @Test
+ void testDuplicateColumnNames() throws Exception {
+ Object[][] singleSheet = {{"Frequency", "Intervals", "Frequency",
"Name", "Frequency", "Intervals"},
+ {6, "0-9", 13, "John", 15, 2}, {4, "10-19", 15, "Sue", 13, 3}};
+
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
+ final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
+ final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
+
+ try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
+ RecordSchema recordSchema = schemaStrategy.getSchema(null,
inputStream, null);
+ assertEquals(6, recordSchema.getFieldNames().size());
+ assertEquals(List.of("Frequency", "Intervals", "Frequency_1",
"Name", "Frequency_2", "Intervals_1"), recordSchema.getFieldNames());
+ }
+ }
+
private static ByteArrayOutputStream createWorkbook(Object[][]...
sheetData) throws IOException {
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();