This is an automated email from the ASF dual-hosted git repository.
exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new 6be3f6e035 NIFI-14538 Added Input File Type with XLS support to
SplitExcel Processor (#10540)
6be3f6e035 is described below
commit 6be3f6e0359390665151845f0b920e05a1377047
Author: dan-s1 <[email protected]>
AuthorDate: Mon Nov 24 16:11:24 2025 -0500
NIFI-14538 Added Input File Type with XLS support to SplitExcel Processor
(#10540)
Signed-off-by: David Handermann <[email protected]>
---
.../java/org/apache/nifi/excel/ExcelReader.java | 4 +-
.../apache/nifi/processors/excel/SplitExcel.java | 149 ++++++++++++++++-----
.../nifi/processors/excel/TestSplitExcel.java | 22 +--
3 files changed, 132 insertions(+), 43 deletions(-)
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
index 7f38dfa9a4..81c2ae6342 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
@@ -58,8 +58,8 @@ import java.util.Map;
+ "This reader allows for inferring a schema from all the required
sheets "
+ "or providing an explicit schema for interpreting the values."
+ "See Controller Service's Usage for further documentation. "
- + "This reader is capable of processing both password and non password
protected .xlsx (XSSF 2007 OOXML file format) "
- + "and older .xls (HSSF '97(-2007) file format) Excel documents.")
+ + "This reader is capable of processing both password and non password
protected" +
+ " modern XLSX and older XLS Excel spreadsheets.")
public class ExcelReader extends SchemaRegistryService implements
RecordReaderFactory {
public static final PropertyDescriptor INPUT_FILE_TYPE = new
PropertyDescriptor
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
index 7270ec66cc..10563e2cc4 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
@@ -27,6 +27,7 @@ import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.excel.InputFileType;
import org.apache.nifi.excel.ProtectionType;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
@@ -36,6 +37,10 @@ import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellCopyContext;
import org.apache.poi.ss.usermodel.CellCopyPolicy;
@@ -48,9 +53,12 @@ import org.apache.poi.ss.util.CellUtil;
import org.apache.poi.xssf.streaming.SXSSFSheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
+import java.io.IOException;
+import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -66,10 +74,10 @@ import static
org.apache.nifi.flowfile.attributes.FragmentAttributes.SEGMENT_ORI
@Tags({"split", "text"})
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@CapabilityDescription("This processor splits a multi sheet Microsoft Excel
spreadsheet into multiple Microsoft Excel spreadsheets where each sheet from
the original" +
- " file is converted to an individual spreadsheet in its own flow file.
Currently this processor is only capable of processing .xlsx" +
- " (XSSF 2007 OOXML file format) Excel documents and not older .xls
(HSSF '97(-2007) file format) documents." +
- " Please note all original cell styles are dropped and formulas are
removed leaving only the calculated values." +
- " Even a single sheet Microsoft Excel spreadsheet is converted to its
own flow file with all the original cell styles dropped and formulas removed."
+ " file is converted to an individual spreadsheet in its own flow file.
This processor is capable of processing both password and non password
protected" +
+ " modern XLSX and older XLS Excel spreadsheets." +
+ " Please note all original cell styles are copied and formulas are
removed leaving only the calculated values." +
+ " Even a single sheet Microsoft Excel spreadsheet is converted to its
own flow file with all the original cell styles copied and formulas removed."
)
@WritesAttributes({
@WritesAttribute(attribute = "fragment.identifier", description = "All
split Excel FlowFiles produced from the same parent Excel FlowFile will have
the same randomly generated UUID added" +
@@ -100,6 +108,14 @@ public class SplitExcel extends AbstractProcessor {
.dependsOn(PROTECTION_TYPE, ProtectionType.PASSWORD)
.build();
+ public static final PropertyDescriptor INPUT_FILE_TYPE = new
PropertyDescriptor.Builder()
+ .name("Input File Type")
+ .description("Specifies type of Excel input file.")
+ .required(true)
+ .allowableValues(InputFileType.class)
+ .defaultValue(InputFileType.XLSX)
+ .build();
+
public static final Relationship REL_ORIGINAL = new Relationship.Builder()
.name("original")
.description("The original FlowFile that was split into segments.
If the FlowFile fails processing, nothing will be sent to this relationship")
@@ -117,7 +133,8 @@ public class SplitExcel extends AbstractProcessor {
private static final List<PropertyDescriptor> PROPERTY_DESCRIPTORS =
List.of(
PROTECTION_TYPE,
- PASSWORD
+ PASSWORD,
+ INPUT_FILE_TYPE
);
private static final Set<Relationship> RELATIONSHIPS = Set.of(
@@ -126,15 +143,26 @@ public class SplitExcel extends AbstractProcessor {
REL_SPLIT
);
- private static final CellCopyPolicy CELL_COPY_POLICY = new
CellCopyPolicy.Builder()
+ private static final CellCopyPolicy XSSF_CELL_COPY_POLICY = new
CellCopyPolicy.Builder()
.cellFormula(false) // NOTE: setting to false allows for copying
the evaluated formula value.
.cellStyle(CellCopyPolicy.DEFAULT_COPY_CELL_STYLE_POLICY)
.cellValue(CellCopyPolicy.DEFAULT_COPY_CELL_VALUE_POLICY)
.condenseRows(CellCopyPolicy.DEFAULT_CONDENSE_ROWS_POLICY)
- .copyHyperlink(false) // NOTE: the hyperlinks appear at end of
sheet, so we need to iterate them separately at the end.
.mergeHyperlink(CellCopyPolicy.DEFAULT_MERGE_HYPERLINK_POLICY)
+ .rowHeight(CellCopyPolicy.DEFAULT_COPY_ROW_HEIGHT_POLICY)
+ .copyHyperlink(false) // NOTE: the hyperlinks appear at end of
sheet, so we need to iterate them separately at the end.
.mergedRegions(false) // NOTE: set to false because of the
explicit merge region handling in the copyRows method.
+ .build();
+
+ private static final CellCopyPolicy HSSF_CELL_COPY_POLICY = new
CellCopyPolicy.Builder()
+ .cellFormula(false) // NOTE: setting to false allows for copying
the evaluated formula value.
+ .cellStyle(CellCopyPolicy.DEFAULT_COPY_CELL_STYLE_POLICY)
+ .cellValue(CellCopyPolicy.DEFAULT_COPY_CELL_VALUE_POLICY)
+ .condenseRows(CellCopyPolicy.DEFAULT_CONDENSE_ROWS_POLICY)
+ .mergeHyperlink(CellCopyPolicy.DEFAULT_MERGE_HYPERLINK_POLICY)
.rowHeight(CellCopyPolicy.DEFAULT_COPY_ROW_HEIGHT_POLICY)
+ .copyHyperlink(CellCopyPolicy.DEFAULT_COPY_HYPERLINK_POLICY)
+ .mergedRegions(CellCopyPolicy.DEFAULT_COPY_MERGED_REGIONS_POLICY)
.build();
@Override
@@ -156,35 +184,16 @@ public class SplitExcel extends AbstractProcessor {
final ProtectionType protectionType =
context.getProperty(PROTECTION_TYPE).asAllowableValue(ProtectionType.class);
final String password = protectionType == ProtectionType.PASSWORD ?
context.getProperty(PASSWORD).getValue() : null;
+ final InputFileType inputFileType =
context.getProperty(INPUT_FILE_TYPE).asAllowableValue(InputFileType.class);
final List<WorkbookSplit> workbookSplits = new ArrayList<>();
try {
session.read(originalFlowFile, in -> {
- final Workbook originalWorkbook = StreamingReader.builder()
- .rowCacheSize(100)
- .bufferSize(4096)
- .password(password)
- .setReadHyperlinks(true) // NOTE: Needed for copying
rows.
- .setReadSharedFormulas(true) // NOTE: If not set to
true, then data with shared formulas fail.
- .open(in);
-
- int index = 0;
- for (final Sheet originalSheet : originalWorkbook) {
- final String originalSheetName =
originalSheet.getSheetName();
-
- try (final SXSSFWorkbook newWorkbook = new
SXSSFWorkbook(null, SXSSFWorkbook.DEFAULT_WINDOW_SIZE, false, true)) {
- final SXSSFSheet newSheet =
newWorkbook.createSheet(originalSheetName);
- final int numberOfCopiedRows = copyRows(originalSheet,
newSheet);
-
- final FlowFile newFlowFile =
session.create(originalFlowFile);
- try (final OutputStream out =
session.write(newFlowFile)) {
- newWorkbook.write(out);
- workbookSplits.add(new WorkbookSplit(index,
newFlowFile, originalSheetName, numberOfCopiedRows));
- }
- }
-
- index++;
+ if (inputFileType == InputFileType.XLSX) {
+ handleXSSF(session, originalFlowFile, in, password,
workbookSplits);
+ } else {
+ handleHSSF(session, originalFlowFile, in, password,
workbookSplits);
}
});
} catch (ExcelRuntimeException | IllegalStateException |
ProcessException e) {
@@ -229,6 +238,35 @@ public class SplitExcel extends AbstractProcessor {
session.transfer(flowFileSplits, REL_SPLIT);
}
+ private void handleXSSF(ProcessSession session, FlowFile originalFlowFile,
InputStream inputStream, String password,
+ List<WorkbookSplit> workbookSplits) throws
IOException {
+ final Workbook originalWorkbook = StreamingReader.builder()
+ .rowCacheSize(100)
+ .bufferSize(4096)
+ .password(password)
+ .setReadHyperlinks(true) // NOTE: Needed for copying rows.
+ .setReadSharedFormulas(true) // NOTE: If not set to true, then
data with shared formulas fail.
+ .open(inputStream);
+
+ int index = 0;
+ for (final Sheet originalSheet : originalWorkbook) {
+ final String originalSheetName = originalSheet.getSheetName();
+
+ try (final SXSSFWorkbook newWorkbook = new SXSSFWorkbook(null,
SXSSFWorkbook.DEFAULT_WINDOW_SIZE, false, true)) {
+ final SXSSFSheet newSheet =
newWorkbook.createSheet(originalSheetName);
+ final int numberOfCopiedRows = copyRows(originalSheet,
newSheet);
+
+ final FlowFile newFlowFile = session.create(originalFlowFile);
+ try (final OutputStream out = session.write(newFlowFile)) {
+ newWorkbook.write(out);
+ workbookSplits.add(new WorkbookSplit(index, newFlowFile,
originalSheetName, numberOfCopiedRows));
+ }
+ }
+
+ index++;
+ }
+ }
+
private int copyRows(final Sheet originalSheet, final SXSSFSheet
destinationSheet) {
final CellCopyContext cellCopyContext = new CellCopyContext();
int rowCount = 0;
@@ -239,7 +277,7 @@ public class SplitExcel extends AbstractProcessor {
for (final Cell sourceCell : sourceRow) {
final Cell destCell =
destinationRow.createCell(sourceCell.getColumnIndex());
- CellUtil.copyCell(sourceCell, destCell, CELL_COPY_POLICY,
cellCopyContext);
+ CellUtil.copyCell(sourceCell, destCell, XSSF_CELL_COPY_POLICY,
cellCopyContext);
}
rowCount++;
@@ -256,5 +294,50 @@ public class SplitExcel extends AbstractProcessor {
return rowCount;
}
- private record WorkbookSplit(int index, FlowFile content, String
sheetName, int numRows) { }
+ private void handleHSSF(ProcessSession session, FlowFile originalFlowFile,
InputStream inputStream, String password, List<WorkbookSplit> workbookSplits) {
+ // Providing the password to the HSSFWorkbook is done by setting a
thread variable managed by
+ // Biff8EncryptionKey. After the workbook is created, the thread
variable can be cleared.
+ Biff8EncryptionKey.setCurrentUserPassword(password);
+
+ try {
+ final HSSFWorkbook originalWorkbook = new
HSSFWorkbook(inputStream);
+ final Iterator<Sheet> originalSheetsIterator =
originalWorkbook.sheetIterator();
+ final CellCopyContext cellCopyContext = new CellCopyContext();
+
+ int index = 0;
+ while (originalSheetsIterator.hasNext()) {
+ final HSSFSheet originalSheet = (HSSFSheet)
originalSheetsIterator.next();
+ final String originalSheetName = originalSheet.getSheetName();
+ //NOTE: Per the POI Javadocs, the rowIterator returns an
iterator of the physical rows,
+ // hence the original number of rows should reflect this.
+ final int originalNumRows =
originalSheet.getPhysicalNumberOfRows();
+ final Iterator<Row> originalRowsIterator =
originalSheet.rowIterator();
+
+ try (HSSFWorkbook newWorkbook = new HSSFWorkbook()) {
+ final HSSFSheet newSheet =
newWorkbook.createSheet(originalSheetName);
+ while (originalRowsIterator.hasNext()) {
+ HSSFRow originalRow = (HSSFRow)
originalRowsIterator.next();
+ HSSFRow newRow =
newSheet.createRow(originalRow.getRowNum());
+ newRow.copyRowFrom(originalRow, HSSF_CELL_COPY_POLICY,
cellCopyContext);
+ }
+
+ FlowFile newFlowFile = session.create(originalFlowFile);
+
+ try (final OutputStream out = session.write(newFlowFile)) {
+ newWorkbook.write(out);
+ workbookSplits.add(new WorkbookSplit(index,
newFlowFile, originalSheetName, originalNumRows));
+ }
+ }
+ index++;
+ }
+
+ } catch (final IOException e) {
+ throw new ProcessException("Failed to split XLS file", e);
+ } finally {
+ Biff8EncryptionKey.setCurrentUserPassword(null);
+ }
+ }
+
+ private record WorkbookSplit(int index, FlowFile content, String
sheetName, int numRows) {
+ }
}
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
index b0540da260..b6526f821a 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
@@ -16,9 +16,11 @@
*/
package org.apache.nifi.processors.excel;
+import org.apache.nifi.excel.InputFileType;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.CellType;
@@ -26,12 +28,14 @@ import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@@ -198,8 +202,9 @@ public class TestSplitExcel {
}
}
- @Test
- void testCopyDateTime() throws Exception {
+ @ParameterizedTest
+ @EnumSource(InputFileType.class)
+ void testCopyDateTime(InputFileType inputFileType) throws Exception {
final LocalDateTime localDateTime = LocalDateTime.of(2023, 1, 1, 0, 0,
0);
final LocalDateTime nonValidExcelDate = LocalDateTime.of(1899, 12, 31,
0, 0, 0);
@@ -210,14 +215,15 @@ public class TestSplitExcel {
};
final ByteArrayOutputStream workbookOutputStream = new
ByteArrayOutputStream();
- try (XSSFWorkbook workbook = new XSSFWorkbook()) {
- final XSSFSheet sheet = workbook.createSheet("SomeSheetName");
+ try (Workbook workbook = InputFileType.XLSX == inputFileType ? new
XSSFWorkbook() : new HSSFWorkbook()) {
+ final Sheet sheet = workbook.createSheet("SomeSheetName");
populateSheet(sheet, data);
setCellStyles(sheet, workbook);
workbook.write(workbookOutputStream);
}
final ByteArrayInputStream input = new
ByteArrayInputStream(workbookOutputStream.toByteArray());
+ runner.setProperty(SplitExcel.INPUT_FILE_TYPE,
inputFileType.getValue());
runner.enqueue(input);
runner.run();
@@ -226,7 +232,7 @@ public class TestSplitExcel {
runner.assertTransferCount(SplitExcel.REL_FAILURE, 0);
final MockFlowFile flowFile =
runner.getFlowFilesForRelationship(SplitExcel.REL_SPLIT).getFirst();
- try (XSSFWorkbook workbook = new
XSSFWorkbook(flowFile.getContentStream())) {
+ try (Workbook workbook = InputFileType.XLSX == inputFileType ? new
XSSFWorkbook(flowFile.getContentStream()) : new
HSSFWorkbook(flowFile.getContentStream())) {
final Sheet firstSheet = workbook.sheetIterator().next();
List<List<Cell>> dateCells =
Stream.iterate(firstSheet.getFirstRowNum() + 1, rowIndex -> rowIndex + 1)
@@ -273,7 +279,7 @@ public class TestSplitExcel {
}
}
- private static void populateSheet(XSSFSheet sheet, Object[][] data) {
+ private static void populateSheet(Sheet sheet, Object[][] data) {
int rowCount = 0;
for (Object[] dataRow : data) {
Row row = sheet.createRow(rowCount++);
@@ -293,7 +299,7 @@ public class TestSplitExcel {
}
}
- void setCellStyles(XSSFSheet sheet, XSSFWorkbook workbook) {
+ void setCellStyles(Sheet sheet, Workbook workbook) {
CreationHelper creationHelper = workbook.getCreationHelper();
CellStyle dayMonthYearCellStyle = workbook.createCellStyle();
dayMonthYearCellStyle.setDataFormat(creationHelper.createDataFormat().getFormat("dd/mm/yyyy"));