This is an automated email from the ASF dual-hosted git repository.
chriss pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new c96b75076e NIFI-14265 Ensured aligned date columns across multiple
sheets when using the Use Starting Row strategy are inferred only as dates.
c96b75076e is described below
commit c96b75076e8d672f3c69c0f7793aaf7b13694857
Author: dan-s1 <[email protected]>
AuthorDate: Wed Feb 26 16:59:32 2025 +0000
NIFI-14265 Ensured aligned date columns across multiple sheets when using
the Use Starting Row strategy are inferred only as dates.
Signed-off-by: Chris Sampson <[email protected]>
This closes #9750.
---
.../nifi/excel/ExcelHeaderSchemaStrategy.java | 25 ++---
.../nifi/excel/TestExcelHeaderSchemaStrategy.java | 110 ++++++++++++++-------
2 files changed, 90 insertions(+), 45 deletions(-)
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
index e77ec9803f..5db05490d2 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
@@ -47,7 +47,8 @@ public class ExcelHeaderSchemaStrategy implements
SchemaAccessStrategy {
static final int NUM_ROWS_TO_DETERMINE_TYPES = 10; // NOTE: This number is
arbitrary.
static final AllowableValue USE_STARTING_ROW = new AllowableValue("Use
Starting Row", "Use Starting Row",
"The configured first row of the Excel file is a header line that
contains the names of the columns. The schema will be derived by using the "
- + "column names in the header and the following " +
NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of each column");
+ + "column names in the header of the first sheet and the
following " + NUM_ROWS_TO_DETERMINE_TYPES + " rows to determine the type(s) of
each column " +
+ "while the configured header rows of subsequent sheets
are skipped.");
private final PropertyContext context;
private final ComponentLog logger;
@@ -85,16 +86,18 @@ public class ExcelHeaderSchemaStrategy implements
SchemaAccessStrategy {
int index = 0;
while (rowIterator.hasNext()) {
- Row row = rowIterator.next();
- if (index == 0) {
- fieldNames = getFieldNames(firstRow, row);
- } else if (index <= NUM_ROWS_TO_DETERMINE_TYPES) {
- inferSchema(row, fieldNames, typeMap);
- } else {
- break;
- }
-
- index++;
+ Row row = rowIterator.next();
+ if (index == 0) {
+ fieldNames = getFieldNames(firstRow, row);
+ } else if (row.getRowNum() == zeroBasedFirstRow) { // skip first
row of all sheets
+ continue;
+ } else if (index <= NUM_ROWS_TO_DETERMINE_TYPES) {
+ inferSchema(row, fieldNames, typeMap);
+ } else {
+ break;
+ }
+
+ index++;
}
if (typeMap.isEmpty()) {
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
index 71a16a3aea..f431bbc4be 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelHeaderSchemaStrategy.java
@@ -22,9 +22,12 @@ import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.schema.access.SchemaNotFoundException;
import org.apache.nifi.schema.inference.TimeValueInference;
import org.apache.nifi.serialization.record.RecordField;
+import org.apache.nifi.serialization.record.RecordFieldType;
import org.apache.nifi.serialization.record.RecordSchema;
import org.apache.nifi.util.MockConfigurationContext;
import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
@@ -41,12 +44,13 @@ import java.io.InputStream;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.HashMap;
+import java.time.LocalDate;
import java.util.Map;
import static java.nio.file.Files.newDirectoryStream;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -74,9 +78,9 @@ public class TestExcelHeaderSchemaStrategy {
@Test
void testWhereConfiguredStartRowIsEmpty() throws IOException {
- Object[][] data = {{}, {1, "Manny"}, {2, "Moe"}, {3, "Jack"}};
- final ByteArrayOutputStream outputStream =
getSingleSheetWorkbook(data);
- final Map<PropertyDescriptor, String> properties = new HashMap<>();
+ Object[][] singleSheet = {{}, {1, "Manny"}, {2, "Moe"}, {3, "Jack"}};
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
@@ -88,9 +92,9 @@ public class TestExcelHeaderSchemaStrategy {
@Test
void testWhereConfiguredStartRowHasEmptyCell() throws Exception {
- Object[][] data = {{"ID", "", "Middle"}, {1, "Manny", "M"}, {2, "Moe",
"M"}, {3, "Jack", "J"}};
- final ByteArrayOutputStream outputStream =
getSingleSheetWorkbook(data);
- final Map<PropertyDescriptor, String> properties = new HashMap<>();
+ Object[][] singleSheet = {{"ID", "", "Middle"}, {1, "Manny", "M"}, {2,
"Moe", "M"}, {3, "Jack", "J"}};
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
@@ -103,9 +107,9 @@ public class TestExcelHeaderSchemaStrategy {
@Test
void testWhereInferenceRowHasMoreCellsThanFieldNames() throws Exception {
- Object[][] data = {{"ID", "First", "Middle"}, {1, "Manny", "M"}, {2,
"Moe", "M", "Extra"}, {3, "Jack", "J"}};
- final ByteArrayOutputStream outputStream =
getSingleSheetWorkbook(data);
- final Map<PropertyDescriptor, String> properties = new HashMap<>();
+ Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "Manny",
"M"}, {2, "Moe", "M", "Extra"}, {3, "Jack", "J"}};
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
@@ -117,9 +121,9 @@ public class TestExcelHeaderSchemaStrategy {
@Test
void testWhereTotalRowsLessThanConfiguredInferenceRows() throws Exception {
- Object[][] data = {{"ID", "First", "Middle"}, {1, "Manny", "M"}, {2,
"Moe", "M"}, {3, "Jack", "J"}};
- final ByteArrayOutputStream outputStream =
getSingleSheetWorkbook(data);
- final Map<PropertyDescriptor, String> properties = new HashMap<>();
+ Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "Manny",
"M"}, {2, "Moe", "M"}, {3, "Jack", "J"}};
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
@@ -130,12 +134,12 @@ public class TestExcelHeaderSchemaStrategy {
@Test
void testWhereConfiguredInferenceRowsHasAnEmptyRow() throws IOException {
- Object[][] data = {{"ID", "First", "Middle"}, {1, "One", "O"}, {2,
"Two", "T"}, {3, "Three", "T"},
+ Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "One", "O"},
{2, "Two", "T"}, {3, "Three", "T"},
{4, "Four", "F"}, {5, "Five", "F"}, {}, {7, "Seven", "S"}, {8,
"Eight", "E"},
{9, "Nine", "N"}, {10, "Ten", "T"}};
- final ByteArrayOutputStream outputStream =
getSingleSheetWorkbook(data);
- final Map<PropertyDescriptor, String> properties = new HashMap<>();
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
@@ -146,12 +150,12 @@ public class TestExcelHeaderSchemaStrategy {
@Test
void testWhereTotalRowsGreaterThanConfiguredInferenceRows() throws
Exception {
- Object[][] data = {{"ID", "First", "Middle"}, {1, "One", "O"}, {2,
"Two", "T"}, {3, "Three", "T"},
+ Object[][] singleSheet = {{"ID", "First", "Middle"}, {1, "One", "O"},
{2, "Two", "T"}, {3, "Three", "T"},
{4, "Four", "F"}, {5, "Five", "F"}, {6, "Six", "S"}, {7,
"Seven", "S"}, {8, "Eight", "E"},
{9, "Nine", "N"}, {10, "Ten", "T"}, {11, "Eleven", "E"}};
- final ByteArrayOutputStream outputStream =
getSingleSheetWorkbook(data);
- final Map<PropertyDescriptor, String> properties = new HashMap<>();
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
@@ -162,9 +166,9 @@ public class TestExcelHeaderSchemaStrategy {
@Test
void testWhereConfiguredInferenceRowsAreAllBlank() throws IOException {
- Object[][] data = {{"ID", "First", "Middle"}, {}, {}, {}, {}, {}, {},
{}, {}, {}, {}, {11, "Eleven", "E"}};
- final ByteArrayOutputStream outputStream =
getSingleSheetWorkbook(data);
- final Map<PropertyDescriptor, String> properties = new HashMap<>();
+ Object[][] singleSheet = {{"ID", "First", "Middle"}, {}, {}, {}, {},
{}, {}, {}, {}, {}, {}, {11, "Eleven", "E"}};
+ final ByteArrayOutputStream outputStream = createWorkbook(singleSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
@@ -174,22 +178,60 @@ public class TestExcelHeaderSchemaStrategy {
}
}
- private static ByteArrayOutputStream getSingleSheetWorkbook(Object[][]
data) throws IOException {
+ @Test
+ void testAlignedDateColumnsAcrossTwoSheets() throws Exception {
+ final String dateColumnName = "Date";
+ final Object[] columnNames = {dateColumnName, "Something", "Name"};
+ final Object[][] firstSheet =
+ {columnNames, {LocalDate.of(2025, 2, 1), "test1", "Sheet1"},
{LocalDate.of(2024, 2, 12), "test2", "Sheet1"}};
+ Object[][] secondSheet =
+ {columnNames, {LocalDate.of(1976, 9, 11), "test1", "Sheet2"},
{LocalDate.of(1987, 2, 12), "test2", "Sheet2"}};
+ final ByteArrayOutputStream outputStream = createWorkbook(firstSheet,
secondSheet);
+ final Map<PropertyDescriptor, String> properties = Map.of();
+ final ConfigurationContext context = new
MockConfigurationContext(properties, null, null);
+ final ExcelHeaderSchemaStrategy schemaStrategy = new
ExcelHeaderSchemaStrategy(context, logger, TIME_VALUE_INFERENCE);
+
+ try (final InputStream inputStream = new
ByteArrayInputStream(outputStream.toByteArray())) {
+ final RecordSchema schema = schemaStrategy.getSchema(null,
inputStream, null);
+ final RecordField dateRecordField =
schema.getField(dateColumnName).orElse(null);
+
+ assertNotNull(dateRecordField);
+ assertEquals(RecordFieldType.DATE,
dateRecordField.getDataType().getFieldType(), String.format("Expected record
field type to be %s but it was type %s",
+ RecordFieldType.DATE,
dateRecordField.getDataType().getFieldType()));
+ }
+ }
+
+ private static ByteArrayOutputStream createWorkbook(Object[][]...
sheetData) throws IOException {
final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+
try (XSSFWorkbook workbook = new XSSFWorkbook()) {
- final XSSFSheet sheet = workbook.createSheet("Sheet 1");
- int rowCount = 0;
- for (Object[] dataRow : data) {
- Row row = sheet.createRow(rowCount++);
- int columnCount = 0;
- for (Object field : dataRow) {
- Cell cell = row.createCell(columnCount++);
- if (field instanceof String) {
- cell.setCellValue((String) field);
- } else if (field instanceof Number) {
- cell.setCellValue(((Number) field).doubleValue());
+ CreationHelper creationHelper = workbook.getCreationHelper();
+ CellStyle dayMonthYearCellStyle = workbook.createCellStyle();
+
dayMonthYearCellStyle.setDataFormat(creationHelper.createDataFormat().getFormat("dd/mm/yyyy"));
+ int sheetCount = 1;
+
+ for (Object[][] singleSheet : sheetData) {
+ final XSSFSheet sheet = workbook.createSheet("Sheet " +
sheetCount);
+ int rowCount = 0;
+
+ for (Object[] singleRow : singleSheet) {
+ Row row = sheet.createRow(rowCount++);
+ int columnCount = 0;
+
+ for (Object field : singleRow) {
+ Cell cell = row.createCell(columnCount++);
+ switch (field) {
+ case String string -> cell.setCellValue(string);
+ case Number number ->
cell.setCellValue(number.doubleValue());
+ case LocalDate localDate -> {
+ cell.setCellValue(localDate);
+ cell.setCellStyle(dayMonthYearCellStyle);
+ }
+ default -> throw new
IllegalStateException("Unexpected value: " + field);
+ }
}
}
+ sheetCount++;
}
workbook.write(outputStream);
}