(nifi) branch main updated: NIFI-14265 Ensured aligned date columns across multiple sheets when using the Use Starting Row strategy are inferred only as dates.

chriss Fri, 28 Feb 2025 12:46:50 -0800

This is an automated email from the ASF dual-hosted git repository.

chriss pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git



The following commit(s) were added to refs/heads/main by this push:
     new c96b75076e NIFI-14265 Ensured aligned date columns across multiple 
sheets when using the Use Starting Row strategy are inferred only as dates.
c96b75076e is described below

commit c96b75076e8d672f3c69c0f7793aaf7b13694857
Author: dan-s1 <[email protected]>
AuthorDate: Wed Feb 26 16:59:32 2025 +0000

    NIFI-14265 Ensured aligned date columns across multiple sheets when using 
the Use Starting Row strategy are inferred only as dates.
    
    Signed-off-by: Chris Sampson <[email protected]>
    
    This closes #9750.
---
 .../nifi/excel/ExcelHeaderSchemaStrategy.java      |  25 ++---
 .../nifi/excel/TestExcelHeaderSchemaStrategy.java  | 110 ++++++++++++++-------
 2 files changed, 90 insertions(+), 45 deletions(-)

diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
index e77ec9803f..5db05490d2 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
@@ -47,7 +47,8 @@ public class ExcelHeaderSchemaStrategy implements 
SchemaAccessStrategy {
     static final int NUM_ROWS_TO_DETERMINE_TYPES = 10; // NOTE: This number is 
arbitrary.
     static final AllowableValue USE_STARTING_ROW = new AllowableValue("Use 
Starting Row", "Use Starting Row",
             "The configured first row of the Excel file is a header line that 
contains the names of the columns. The schema will be derived by using the "
-                    + "column names in the header and the following " + 
NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of each column");
+                    + "column names in the header of the first sheet and the 
following " + NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of 
each column " +
+                      "while the configured header rows of subsequent sheets 
are skipped.");
 
     private final PropertyContext context;
     private final ComponentLog logger;
@@ -85,16 +86,18 @@ public class ExcelHeaderSchemaStrategy implements 
SchemaAccessStrategy {
         int index = 0;
 
         while (rowIterator.hasNext()) {
-           Row row = rowIterator.next();
-           if (index == 0) {
-               fieldNames = getFieldNames(firstRow, row);
-           } else if (index <= NUM_ROWS_TO_DETERMINE_TYPES) {
-               inferSchema(row, fieldNames, typeMap);
-           } else {
-               break;
-           }
-
-           index++;
+            Row row = rowIterator.next();
+            if (index == 0) {
+                fieldNames = getFieldNames(firstRow, row);
+            } else if (row.getRowNum() == zeroBasedFirstRow) { // skip first 
row of all sheets
+                continue;
+            } else if (index <= NUM_ROWS_TO_DETERMINE_TYPES) {
+                inferSchema(row, fieldNames, typeMap);
+            } else {
+                break;
+            }
+
+            index++;
         }
 
         if (typeMap.isEmpty()) {
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
index 71a16a3aea..f431bbc4be 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
@@ -22,9 +22,12 @@ import org.apache.nifi.logging.ComponentLog;
 import org.apache.nifi.schema.access.SchemaNotFoundException;
 import org.apache.nifi.schema.inference.TimeValueInference;
 import org.apache.nifi.serialization.record.RecordField;
+import org.apache.nifi.serialization.record.RecordFieldType;
 import org.apache.nifi.serialization.record.RecordSchema;
 import org.apache.nifi.util.MockConfigurationContext;
 import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.ss.usermodel.CreationHelper;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
@@ -41,12 +44,13 @@ import java.io.InputStream;
 import java.nio.file.DirectoryStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.HashMap;
+import java.time.LocalDate;
 import java.util.Map;
 
 import static java.nio.file.Files.newDirectoryStream;
 import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -74,9 +78,9 @@ public class TestExcelHeaderSchemaStrategy {
 
     @Test
     void testWhereConfiguredStartRowIsEmpty() throws IOException {
-        Object[][] data = {{}, {1, "Manny"}, {2, "Moe"}, {3, "Jack"}};
-        final ByteArrayOutputStream outputStream = 
getSingleSheetWorkbook(data);
-        final Map<PropertyDescriptor, String> properties = new HashMap<>();
+        Object[][] singleSheet = {{}, {1, "Manny"}, {2, "Moe"}, {3, "Jack"}};
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
         final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
         final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
@@ -88,9 +92,9 @@ public class TestExcelHeaderSchemaStrategy {
 
     @Test
     void testWhereConfiguredStartRowHasEmptyCell() throws Exception {
-        Object[][] data = {{"ID", "", "Middle"}, {1, "Manny", "M"}, {2, "Moe", 
"M"}, {3, "Jack", "J"}};
-        final ByteArrayOutputStream outputStream = 
getSingleSheetWorkbook(data);
-        final Map<PropertyDescriptor, String> properties = new HashMap<>();
+        Object[][] singleSheet = {{"ID", "", "Middle"}, {1, "Manny", "M"}, {2, 
"Moe", "M"}, {3, "Jack", "J"}};
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
         final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
         final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
@@ -103,9 +107,9 @@ public class TestExcelHeaderSchemaStrategy {
 
     @Test
     void testWhereInferenceRowHasMoreCellsThanFieldNames() throws Exception {
-        Object[][] data = {{"ID", "First", "Middle"}, {1, "Manny", "M"}, {2, 
"Moe", "M", "Extra"}, {3, "Jack", "J"}};
-        final ByteArrayOutputStream outputStream = 
getSingleSheetWorkbook(data);
-        final Map<PropertyDescriptor, String> properties = new HashMap<>();
+        Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "Manny", 
"M"}, {2, "Moe", "M", "Extra"}, {3, "Jack", "J"}};
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
         final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
         final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
@@ -117,9 +121,9 @@ public class TestExcelHeaderSchemaStrategy {
 
     @Test
     void testWhereTotalRowsLessThanConfiguredInferenceRows() throws Exception {
-        Object[][] data = {{"ID", "First", "Middle"}, {1, "Manny", "M"}, {2, 
"Moe", "M"}, {3, "Jack", "J"}};
-        final ByteArrayOutputStream outputStream = 
getSingleSheetWorkbook(data);
-        final Map<PropertyDescriptor, String> properties = new HashMap<>();
+        Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "Manny", 
"M"}, {2, "Moe", "M"}, {3, "Jack", "J"}};
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
         final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
         final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
@@ -130,12 +134,12 @@ public class TestExcelHeaderSchemaStrategy {
 
     @Test
     void testWhereConfiguredInferenceRowsHasAnEmptyRow() throws IOException {
-        Object[][] data = {{"ID", "First", "Middle"}, {1, "One", "O"}, {2, 
"Two", "T"}, {3, "Three", "T"},
+        Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "One", "O"}, 
{2, "Two", "T"}, {3, "Three", "T"},
                 {4, "Four", "F"}, {5, "Five", "F"}, {}, {7, "Seven", "S"}, {8, 
"Eight", "E"},
                 {9, "Nine", "N"}, {10, "Ten", "T"}};
 
-        final ByteArrayOutputStream outputStream = 
getSingleSheetWorkbook(data);
-        final Map<PropertyDescriptor, String> properties = new HashMap<>();
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
         final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
         final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
@@ -146,12 +150,12 @@ public class TestExcelHeaderSchemaStrategy {
 
     @Test
     void testWhereTotalRowsGreaterThanConfiguredInferenceRows() throws 
Exception {
-        Object[][] data = {{"ID", "First", "Middle"}, {1, "One", "O"}, {2, 
"Two", "T"}, {3, "Three", "T"},
+        Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "One", "O"}, 
{2, "Two", "T"}, {3, "Three", "T"},
                 {4, "Four", "F"}, {5, "Five", "F"}, {6, "Six", "S"}, {7, 
"Seven", "S"}, {8, "Eight", "E"},
                 {9, "Nine", "N"}, {10, "Ten", "T"}, {11, "Eleven", "E"}};
 
-        final ByteArrayOutputStream outputStream = 
getSingleSheetWorkbook(data);
-        final Map<PropertyDescriptor, String> properties = new HashMap<>();
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
         final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
         final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
@@ -162,9 +166,9 @@ public class TestExcelHeaderSchemaStrategy {
 
     @Test
     void testWhereConfiguredInferenceRowsAreAllBlank() throws IOException {
-        Object[][] data = {{"ID", "First", "Middle"}, {}, {}, {}, {}, {}, {}, 
{}, {}, {}, {}, {11, "Eleven", "E"}};
-        final ByteArrayOutputStream outputStream = 
getSingleSheetWorkbook(data);
-        final Map<PropertyDescriptor, String> properties = new HashMap<>();
+        Object[][] singleSheet = {{"ID", "First", "Middle"}, {}, {}, {}, {}, 
{}, {}, {}, {}, {}, {}, {11, "Eleven", "E"}};
+        final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
         final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
         final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
 
@@ -174,22 +178,60 @@ public class TestExcelHeaderSchemaStrategy {
         }
     }
 
-    private static ByteArrayOutputStream getSingleSheetWorkbook(Object[][] 
data) throws IOException {
+    @Test
+    void testAlignedDateColumnsAcrossTwoSheets() throws Exception {
+        final String dateColumnName = "Date";
+        final Object[] columnNames = {dateColumnName, "Something", "Name"};
+        final Object[][] firstSheet =
+                {columnNames, {LocalDate.of(2025, 2, 1), "test1", "Sheet1"}, 
{LocalDate.of(2024, 2, 12), "test2", "Sheet1"}};
+        Object[][] secondSheet =
+                {columnNames, {LocalDate.of(1976, 9, 11), "test1", "Sheet2"}, 
{LocalDate.of(1987, 2, 12), "test2", "Sheet2"}};
+        final ByteArrayOutputStream outputStream = createWorkbook(firstSheet, 
secondSheet);
+        final Map<PropertyDescriptor, String> properties = Map.of();
+        final ConfigurationContext context = new 
MockConfigurationContext(properties, null, null);
+        final ExcelHeaderSchemaStrategy schemaStrategy = new 
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
+
+        try (final InputStream inputStream = new 
ByteArrayInputStream(outputStream.toByteArray())) {
+            final RecordSchema schema = schemaStrategy.getSchema(null, 
inputStream, null);
+            final RecordField dateRecordField = 
schema.getField(dateColumnName).orElse(null);
+
+            assertNotNull(dateRecordField);
+            assertEquals(RecordFieldType.DATE, 
dateRecordField.getDataType().getFieldType(), String.format("Expected record 
field type to be %s but it was type %s",
+                    RecordFieldType.DATE, 
dateRecordField.getDataType().getFieldType()));
+        }
+    }
+
+    private static ByteArrayOutputStream createWorkbook(Object[][]... 
sheetData) throws IOException {
         final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+
         try (XSSFWorkbook workbook = new XSSFWorkbook()) {
-            final XSSFSheet sheet = workbook.createSheet("Sheet 1");
-            int rowCount = 0;
-            for (Object[] dataRow : data) {
-                Row row = sheet.createRow(rowCount++);
-                int columnCount = 0;
-                for (Object field : dataRow) {
-                    Cell cell = row.createCell(columnCount++);
-                    if (field instanceof String) {
-                        cell.setCellValue((String) field);
-                    } else if (field instanceof Number) {
-                        cell.setCellValue(((Number) field).doubleValue());
+            CreationHelper creationHelper = workbook.getCreationHelper();
+            CellStyle dayMonthYearCellStyle = workbook.createCellStyle();
+            
dayMonthYearCellStyle.setDataFormat(creationHelper.createDataFormat().getFormat("dd/mm/yyyy"));
+            int sheetCount = 1;
+
+            for (Object[][] singleSheet : sheetData) {
+                final XSSFSheet sheet = workbook.createSheet("Sheet " + 
sheetCount);
+                int rowCount = 0;
+
+                for (Object[] singleRow : singleSheet) {
+                    Row row = sheet.createRow(rowCount++);
+                    int columnCount = 0;
+
+                    for (Object field : singleRow) {
+                        Cell cell = row.createCell(columnCount++);
+                        switch (field) {
+                            case String string -> cell.setCellValue(string);
+                            case Number number -> 
cell.setCellValue(number.doubleValue());
+                            case LocalDate localDate -> {
+                                cell.setCellValue(localDate);
+                                cell.setCellStyle(dayMonthYearCellStyle);
+                            }
+                            default -> throw new 
IllegalStateException("Unexpected value: " + field);
+                        }
                     }
                 }
+                sheetCount++;
             }
             workbook.write(outputStream);
         }

(nifi) branch main updated: NIFI-14265 Ensured aligned date columns across multiple sheets when using the Use Starting Row strategy are inferred only as dates.

Reply via email to