(nifi) branch main updated: NIFI-14596 Updated ExcelHeaderSchemaStrategy to append a number on duplicate columns (#9975)

exceptionfactory Sat, 07 Jun 2025 14:16:10 -0700

This is an automated email from the ASF dual-hosted git repository.

exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git



The following commit(s) were added to refs/heads/main by this push:
     new 8f2657bdb0 NIFI-14596 Updated ExcelHeaderSchemaStrategy to append a 
number on duplicate columns (#9975)
8f2657bdb0 is described below

commit 8f2657bdb00bc5664f328b82c01a96b7d18f44e1
Author: dan-s1 <[email protected]>
AuthorDate: Sat Jun 7 16:39:15 2025 -0400

    NIFI-14596 Updated ExcelHeaderSchemaStrategy to append a number on 
duplicate columns (#9975)
    
    - Revised implementation avoids losing data when reading Excel files with 
duplicate column names using the ExcelHeaderSchemaStrategy
    
    Signed-off-by: David Handermann <[email protected]>
---
 .../nifi/excel/ExcelHeaderSchemaStrategy.java      | 27 +++++++++++++++++++---
 .../nifi/excel/TestExcelHeaderSchemaStrategy.java  | 18 +++++++++++++++
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
index fc8073adf7..edf80710a7 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
@@ -35,6 +35,7 @@ import org.apache.poi.ss.usermodel.Row;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.EnumSet;
+import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
@@ -47,8 +48,10 @@ public class ExcelHeaderSchemaStrategy implements 
SchemaAccessStrategy {
     static final int NUM_ROWS_TO_DETERMINE_TYPES = 10; // NOTE: This number is 
arbitrary.
     static final AllowableValue USE_STARTING_ROW = new AllowableValue("Use 
Starting Row", "Use Starting Row",
             "The configured first row of the Excel file is a header line that 
contains the names of the columns. The schema will be derived by using the "
-                    + "column names in the header of the first sheet and the 
following " + NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of 
each column " +
-                      "while the configured header rows of subsequent sheets 
are skipped.");
+                    + "column names in the header of the first sheet and the 
following " + NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of 
each column "
+                    + "while the configured header rows of subsequent sheets 
are skipped. "
+                    + "NOTE: If there are duplicate column names then each 
subsequent duplicate column name is given a one up number. "
+                    + "For example, column names \"Name\", \"Name\" will be 
changed to \"Name\", \"Name_1\"");
 
     private final PropertyContext context;
     private final ComponentLog logger;
@@ -126,8 +129,26 @@ public class ExcelHeaderSchemaStrategy implements 
SchemaAccessStrategy {
                 fieldNames.add(fieldName);
             }
         }
+        final List<String> renamedDuplicateFieldNames = 
renameDuplicateFieldNames(fieldNames);
 
-        return fieldNames;
+        return renamedDuplicateFieldNames;
+    }
+
+    private List<String> renameDuplicateFieldNames(final List<String> 
fieldNames) {
+        final Map<String, Integer> fieldNameCounts = new HashMap<>();
+        final List<String> renamedDuplicateFieldNames = new ArrayList<>();
+
+        for (String fieldName : fieldNames) {
+            if (fieldNameCounts.containsKey(fieldName)) {
+                final int count = fieldNameCounts.get(fieldName);
+                renamedDuplicateFieldNames.add("%s_%d".formatted(fieldName, 
count));
+                fieldNameCounts.put(fieldName, count + 1);
+            } else {
+                fieldNameCounts.put(fieldName, 1);
+                renamedDuplicateFieldNames.add(fieldName);
+            }
+        }
+        return renamedDuplicateFieldNames;
     }
 
     private void inferSchema(final Row row, final List<String> fieldNames, 
final Map<String, FieldTypeInference> typeMap) throws SchemaNotFoundException {
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
index f431bbc4be..28fb4dfd8c 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
@@ -45,6 +45,7 @@ import java.nio.file.DirectoryStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.time.LocalDate;
+import java.util.List;
 import java.util.Map;
 
 import static java.nio.file.Files.newDirectoryStream;
@@ -201,6 +202,23 @@ public class TestExcelHeaderSchemaStrategy {
         }
     }
 
+    @Test
+    void testDuplicateColumnNames() throws Exception {
+        Object[][] singleSheet = {{"Frequency", "Intervals", "Frequency", 
"Name", "Frequency", "Intervals"},
+                {6, "0-9", 13, "John", 15, 2}, {4, "10-19", 15, "Sue", 13, 3}};
+
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
+        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
+        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
+
+        try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
+            RecordSchema recordSchema = schemaStrategy.getSchema(null, 
inputStream, null);
+            assertEquals(6, recordSchema.getFieldNames().size());
+            assertEquals(List.of("Frequency", "Intervals", "Frequency_1", 
"Name", "Frequency_2", "Intervals_1"), recordSchema.getFieldNames());
+        }
+    }
+
     private static ByteArrayOutputStream createWorkbook(Object[][]... 
sheetData) throws IOException {
         final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

(nifi) branch main updated: NIFI-14596 Updated ExcelHeaderSchemaStrategy to append a number on duplicate columns (#9975)

Reply via email to