(nifi) branch main updated: NIFI-14538 Added Input File Type with XLS support to SplitExcel Processor (#10540)

exceptionfactory Mon, 24 Nov 2025 13:11:55 -0800

This is an automated email from the ASF dual-hosted git repository.

exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git



The following commit(s) were added to refs/heads/main by this push:
     new 6be3f6e035 NIFI-14538 Added Input File Type with XLS support to 
SplitExcel Processor (#10540)
6be3f6e035 is described below

commit 6be3f6e0359390665151845f0b920e05a1377047
Author: dan-s1 <[email protected]>
AuthorDate: Mon Nov 24 16:11:24 2025 -0500

    NIFI-14538 Added Input File Type with XLS support to SplitExcel Processor 
(#10540)
    
    Signed-off-by: David Handermann <[email protected]>
---
 .../java/org/apache/nifi/excel/ExcelReader.java    |   4 +-
 .../apache/nifi/processors/excel/SplitExcel.java   | 149 ++++++++++++++++-----
 .../nifi/processors/excel/TestSplitExcel.java      |  22 +--
 3 files changed, 132 insertions(+), 43 deletions(-)

diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
index 7f38dfa9a4..81c2ae6342 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
@@ -58,8 +58,8 @@ import java.util.Map;
         + "This reader allows for inferring a schema from all the required 
sheets "
         + "or providing an explicit schema for interpreting the values."
         + "See Controller Service's Usage for further documentation. "
-        + "This reader is capable of processing both password and non password 
protected .xlsx (XSSF 2007 OOXML file format) "
-        + "and older .xls (HSSF '97(-2007) file format) Excel documents.")
+        + "This reader is capable of processing both password and non password 
protected" +
+        " modern XLSX and older XLS Excel spreadsheets.")
 public class ExcelReader extends SchemaRegistryService implements 
RecordReaderFactory {
 
     public static final PropertyDescriptor INPUT_FILE_TYPE = new 
PropertyDescriptor
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
index 7270ec66cc..10563e2cc4 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
@@ -27,6 +27,7 @@ import org.apache.nifi.annotation.behavior.WritesAttributes;
 import org.apache.nifi.annotation.documentation.CapabilityDescription;
 import org.apache.nifi.annotation.documentation.Tags;
 import org.apache.nifi.components.PropertyDescriptor;
+import org.apache.nifi.excel.InputFileType;
 import org.apache.nifi.excel.ProtectionType;
 import org.apache.nifi.flowfile.FlowFile;
 import org.apache.nifi.flowfile.attributes.CoreAttributes;
@@ -36,6 +37,10 @@ import org.apache.nifi.processor.ProcessSession;
 import org.apache.nifi.processor.Relationship;
 import org.apache.nifi.processor.exception.ProcessException;
 import org.apache.nifi.processor.util.StandardValidators;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.hssf.usermodel.HSSFRow;
+import org.apache.poi.hssf.usermodel.HSSFSheet;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.ss.usermodel.Cell;
 import org.apache.poi.ss.usermodel.CellCopyContext;
 import org.apache.poi.ss.usermodel.CellCopyPolicy;
@@ -48,9 +53,12 @@ import org.apache.poi.ss.util.CellUtil;
 import org.apache.poi.xssf.streaming.SXSSFSheet;
 import org.apache.poi.xssf.streaming.SXSSFWorkbook;
 
+import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -66,10 +74,10 @@ import static 
org.apache.nifi.flowfile.attributes.FragmentAttributes.SEGMENT_ORI
 @Tags({"split", "text"})
 @InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
 @CapabilityDescription("This processor splits a multi sheet Microsoft Excel 
spreadsheet into multiple Microsoft Excel spreadsheets where each sheet from 
the original" +
-        " file is converted to an individual spreadsheet in its own flow file. 
Currently this processor is only capable of processing .xlsx" +
-        " (XSSF 2007 OOXML file format) Excel documents and not older .xls 
(HSSF '97(-2007) file format) documents." +
-        " Please note all original cell styles are dropped and formulas are 
removed leaving only the calculated values." +
-        " Even a single sheet Microsoft Excel spreadsheet is converted to its 
own flow file with all the original cell styles dropped and formulas removed."
+        " file is converted to an individual spreadsheet in its own flow file. 
This processor is capable of processing both password and non password 
protected" +
+        " modern XLSX and older XLS Excel spreadsheets." +
+        " Please note all original cell styles are copied and formulas are 
removed leaving only the calculated values." +
+        " Even a single sheet Microsoft Excel spreadsheet is converted to its 
own flow file with all the original cell styles copied and formulas removed."
 )
 @WritesAttributes({
         @WritesAttribute(attribute = "fragment.identifier", description = "All 
split Excel FlowFiles produced from the same parent Excel FlowFile will have 
the same randomly generated UUID added" +
@@ -100,6 +108,14 @@ public class SplitExcel extends AbstractProcessor {
             .dependsOn(PROTECTION_TYPE, ProtectionType.PASSWORD)
             .build();
 
+    public static final PropertyDescriptor INPUT_FILE_TYPE = new 
PropertyDescriptor.Builder()
+            .name("Input File Type")
+            .description("Specifies type of Excel input file.")
+            .required(true)
+            .allowableValues(InputFileType.class)
+            .defaultValue(InputFileType.XLSX)
+            .build();
+
     public static final Relationship REL_ORIGINAL = new Relationship.Builder()
             .name("original")
             .description("The original FlowFile that was split into segments. 
If the FlowFile fails processing, nothing will be sent to this relationship")
@@ -117,7 +133,8 @@ public class SplitExcel extends AbstractProcessor {
 
     private static final List<PropertyDescriptor> PROPERTY_DESCRIPTORS = 
List.of(
             PROTECTION_TYPE,
-            PASSWORD
+            PASSWORD,
+            INPUT_FILE_TYPE
     );
 
     private static final Set<Relationship> RELATIONSHIPS = Set.of(
@@ -126,15 +143,26 @@ public class SplitExcel extends AbstractProcessor {
             REL_SPLIT
     );
 
-    private static final CellCopyPolicy CELL_COPY_POLICY = new 
CellCopyPolicy.Builder()
+    private static final CellCopyPolicy XSSF_CELL_COPY_POLICY = new 
CellCopyPolicy.Builder()
             .cellFormula(false) // NOTE: setting to false allows for copying 
the evaluated formula value.
             .cellStyle(CellCopyPolicy.DEFAULT_COPY_CELL_STYLE_POLICY)
             .cellValue(CellCopyPolicy.DEFAULT_COPY_CELL_VALUE_POLICY)
             .condenseRows(CellCopyPolicy.DEFAULT_CONDENSE_ROWS_POLICY)
-            .copyHyperlink(false) // NOTE: the hyperlinks appear at end of 
sheet, so we need to iterate them separately at the end.
             .mergeHyperlink(CellCopyPolicy.DEFAULT_MERGE_HYPERLINK_POLICY)
+            .rowHeight(CellCopyPolicy.DEFAULT_COPY_ROW_HEIGHT_POLICY)
+            .copyHyperlink(false) // NOTE: the hyperlinks appear at end of 
sheet, so we need to iterate them separately at the end.
             .mergedRegions(false) // NOTE: set to false because of the 
explicit merge region handling in the copyRows method.
+            .build();
+
+    private static final CellCopyPolicy HSSF_CELL_COPY_POLICY = new 
CellCopyPolicy.Builder()
+            .cellFormula(false) // NOTE: setting to false allows for copying 
the evaluated formula value.
+            .cellStyle(CellCopyPolicy.DEFAULT_COPY_CELL_STYLE_POLICY)
+            .cellValue(CellCopyPolicy.DEFAULT_COPY_CELL_VALUE_POLICY)
+            .condenseRows(CellCopyPolicy.DEFAULT_CONDENSE_ROWS_POLICY)
+            .mergeHyperlink(CellCopyPolicy.DEFAULT_MERGE_HYPERLINK_POLICY)
             .rowHeight(CellCopyPolicy.DEFAULT_COPY_ROW_HEIGHT_POLICY)
+            .copyHyperlink(CellCopyPolicy.DEFAULT_COPY_HYPERLINK_POLICY)
+            .mergedRegions(CellCopyPolicy.DEFAULT_COPY_MERGED_REGIONS_POLICY)
             .build();
 
     @Override
@@ -156,35 +184,16 @@ public class SplitExcel extends AbstractProcessor {
 
         final ProtectionType protectionType = 
context.getProperty(PROTECTION_TYPE).asAllowableValue(ProtectionType.class);
         final String password = protectionType == ProtectionType.PASSWORD ? 
context.getProperty(PASSWORD).getValue() : null;
+        final InputFileType inputFileType = 
context.getProperty(INPUT_FILE_TYPE).asAllowableValue(InputFileType.class);
         final List<WorkbookSplit> workbookSplits = new ArrayList<>();
 
         try {
             session.read(originalFlowFile, in -> {
 
-                final Workbook originalWorkbook = StreamingReader.builder()
-                        .rowCacheSize(100)
-                        .bufferSize(4096)
-                        .password(password)
-                        .setReadHyperlinks(true) // NOTE: Needed for copying 
rows.
-                        .setReadSharedFormulas(true) // NOTE: If not set to 
true, then data with shared formulas fail.
-                        .open(in);
-
-                int index = 0;
-                for (final Sheet originalSheet : originalWorkbook) {
-                    final String originalSheetName = 
originalSheet.getSheetName();
-
-                    try (final SXSSFWorkbook newWorkbook = new 
SXSSFWorkbook(null, SXSSFWorkbook.DEFAULT_WINDOW_SIZE, false, true)) {
-                        final SXSSFSheet newSheet = 
newWorkbook.createSheet(originalSheetName);
-                        final int numberOfCopiedRows = copyRows(originalSheet, 
newSheet);
-
-                        final FlowFile newFlowFile = 
session.create(originalFlowFile);
-                        try (final OutputStream out = 
session.write(newFlowFile)) {
-                            newWorkbook.write(out);
-                            workbookSplits.add(new WorkbookSplit(index, 
newFlowFile, originalSheetName, numberOfCopiedRows));
-                        }
-                    }
-
-                    index++;
+                if (inputFileType == InputFileType.XLSX) {
+                    handleXSSF(session, originalFlowFile, in, password, 
workbookSplits);
+                } else {
+                    handleHSSF(session, originalFlowFile, in, password, 
workbookSplits);
                 }
             });
         } catch (ExcelRuntimeException | IllegalStateException | 
ProcessException e) {
@@ -229,6 +238,35 @@ public class SplitExcel extends AbstractProcessor {
         session.transfer(flowFileSplits, REL_SPLIT);
     }
 
+    private void handleXSSF(ProcessSession session, FlowFile originalFlowFile, 
InputStream inputStream, String password,
+                            List<WorkbookSplit> workbookSplits) throws 
IOException {
+        final Workbook originalWorkbook = StreamingReader.builder()
+                .rowCacheSize(100)
+                .bufferSize(4096)
+                .password(password)
+                .setReadHyperlinks(true) // NOTE: Needed for copying rows.
+                .setReadSharedFormulas(true) // NOTE: If not set to true, then 
data with shared formulas fail.
+                .open(inputStream);
+
+        int index = 0;
+        for (final Sheet originalSheet : originalWorkbook) {
+            final String originalSheetName = originalSheet.getSheetName();
+
+            try (final SXSSFWorkbook newWorkbook = new SXSSFWorkbook(null, 
SXSSFWorkbook.DEFAULT_WINDOW_SIZE, false, true)) {
+                final SXSSFSheet newSheet = 
newWorkbook.createSheet(originalSheetName);
+                final int numberOfCopiedRows = copyRows(originalSheet, 
newSheet);
+
+                final FlowFile newFlowFile = session.create(originalFlowFile);
+                try (final OutputStream out = session.write(newFlowFile)) {
+                    newWorkbook.write(out);
+                    workbookSplits.add(new WorkbookSplit(index, newFlowFile, 
originalSheetName, numberOfCopiedRows));
+                }
+            }
+
+            index++;
+        }
+    }
+
     private int copyRows(final Sheet originalSheet, final SXSSFSheet 
destinationSheet) {
         final CellCopyContext cellCopyContext = new CellCopyContext();
         int rowCount = 0;
@@ -239,7 +277,7 @@ public class SplitExcel extends AbstractProcessor {
 
             for (final Cell sourceCell : sourceRow) {
                 final Cell destCell = 
destinationRow.createCell(sourceCell.getColumnIndex());
-                CellUtil.copyCell(sourceCell, destCell, CELL_COPY_POLICY, 
cellCopyContext);
+                CellUtil.copyCell(sourceCell, destCell, XSSF_CELL_COPY_POLICY, 
cellCopyContext);
             }
 
             rowCount++;
@@ -256,5 +294,50 @@ public class SplitExcel extends AbstractProcessor {
         return rowCount;
     }
 
-    private record WorkbookSplit(int index, FlowFile content, String 
sheetName, int numRows) { }
+    private void handleHSSF(ProcessSession session, FlowFile originalFlowFile, 
InputStream inputStream, String password, List<WorkbookSplit> workbookSplits) {
+        // Providing the password to the HSSFWorkbook is done by setting a 
thread variable managed by
+        // Biff8EncryptionKey. After the workbook is created, the thread 
variable can be cleared.
+        Biff8EncryptionKey.setCurrentUserPassword(password);
+
+        try {
+            final HSSFWorkbook originalWorkbook = new 
HSSFWorkbook(inputStream);
+            final Iterator<Sheet> originalSheetsIterator = 
originalWorkbook.sheetIterator();
+            final CellCopyContext cellCopyContext = new CellCopyContext();
+
+            int index = 0;
+            while (originalSheetsIterator.hasNext()) {
+                final HSSFSheet originalSheet = (HSSFSheet) 
originalSheetsIterator.next();
+                final String originalSheetName = originalSheet.getSheetName();
+                //NOTE: Per the POI Javadocs, the rowIterator returns an 
iterator of the physical rows,
+                // hence the original number of rows should reflect this.
+                final int originalNumRows = 
originalSheet.getPhysicalNumberOfRows();
+                final Iterator<Row> originalRowsIterator = 
originalSheet.rowIterator();
+
+                try (HSSFWorkbook newWorkbook = new HSSFWorkbook()) {
+                    final HSSFSheet newSheet = 
newWorkbook.createSheet(originalSheetName);
+                    while (originalRowsIterator.hasNext()) {
+                        HSSFRow originalRow = (HSSFRow) 
originalRowsIterator.next();
+                        HSSFRow newRow = 
newSheet.createRow(originalRow.getRowNum());
+                        newRow.copyRowFrom(originalRow, HSSF_CELL_COPY_POLICY, 
cellCopyContext);
+                    }
+
+                    FlowFile newFlowFile = session.create(originalFlowFile);
+
+                    try (final OutputStream out = session.write(newFlowFile)) {
+                        newWorkbook.write(out);
+                        workbookSplits.add(new WorkbookSplit(index, 
newFlowFile, originalSheetName, originalNumRows));
+                    }
+                }
+                index++;
+            }
+
+        } catch (final IOException e) {
+            throw new ProcessException("Failed to split XLS file", e);
+        } finally {
+            Biff8EncryptionKey.setCurrentUserPassword(null);
+        }
+    }
+
+    private record WorkbookSplit(int index, FlowFile content, String 
sheetName, int numRows) {
+    }
 }
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
index b0540da260..b6526f821a 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
@@ -16,9 +16,11 @@
  */
 package org.apache.nifi.processors.excel;
 
+import org.apache.nifi.excel.InputFileType;
 import org.apache.nifi.util.MockFlowFile;
 import org.apache.nifi.util.TestRunner;
 import org.apache.nifi.util.TestRunners;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.ss.usermodel.Cell;
 import org.apache.poi.ss.usermodel.CellStyle;
 import org.apache.poi.ss.usermodel.CellType;
@@ -26,12 +28,14 @@ import org.apache.poi.ss.usermodel.CreationHelper;
 import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.xssf.usermodel.XSSFHyperlink;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
@@ -198,8 +202,9 @@ public class TestSplitExcel {
         }
     }
 
-    @Test
-    void testCopyDateTime() throws Exception {
+    @ParameterizedTest
+    @EnumSource(InputFileType.class)
+    void testCopyDateTime(InputFileType inputFileType) throws Exception {
         final LocalDateTime localDateTime = LocalDateTime.of(2023, 1, 1, 0, 0, 
0);
         final LocalDateTime nonValidExcelDate = LocalDateTime.of(1899, 12, 31, 
0, 0, 0);
 
@@ -210,14 +215,15 @@ public class TestSplitExcel {
         };
 
         final ByteArrayOutputStream workbookOutputStream = new 
ByteArrayOutputStream();
-        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
-            final XSSFSheet sheet = workbook.createSheet("SomeSheetName");
+        try (Workbook workbook = InputFileType.XLSX == inputFileType ? new 
XSSFWorkbook() : new HSSFWorkbook()) {
+            final Sheet sheet = workbook.createSheet("SomeSheetName");
             populateSheet(sheet, data);
             setCellStyles(sheet, workbook);
             workbook.write(workbookOutputStream);
         }
 
         final ByteArrayInputStream input = new 
ByteArrayInputStream(workbookOutputStream.toByteArray());
+        runner.setProperty(SplitExcel.INPUT_FILE_TYPE, 
inputFileType.getValue());
         runner.enqueue(input);
         runner.run();
 
@@ -226,7 +232,7 @@ public class TestSplitExcel {
         runner.assertTransferCount(SplitExcel.REL_FAILURE, 0);
 
         final MockFlowFile flowFile = 
runner.getFlowFilesForRelationship(SplitExcel.REL_SPLIT).getFirst();
-        try (XSSFWorkbook workbook = new 
XSSFWorkbook(flowFile.getContentStream())) {
+        try (Workbook workbook = InputFileType.XLSX == inputFileType ? new 
XSSFWorkbook(flowFile.getContentStream()) : new 
HSSFWorkbook(flowFile.getContentStream())) {
             final Sheet firstSheet = workbook.sheetIterator().next();
 
             List<List<Cell>> dateCells = 
Stream.iterate(firstSheet.getFirstRowNum() + 1, rowIndex -> rowIndex + 1)
@@ -273,7 +279,7 @@ public class TestSplitExcel {
         }
     }
 
-    private static void populateSheet(XSSFSheet sheet, Object[][] data) {
+    private static void populateSheet(Sheet sheet, Object[][] data) {
         int rowCount = 0;
         for (Object[] dataRow : data) {
             Row row = sheet.createRow(rowCount++);
@@ -293,7 +299,7 @@ public class TestSplitExcel {
         }
     }
 
-    void setCellStyles(XSSFSheet sheet, XSSFWorkbook workbook) {
+    void setCellStyles(Sheet sheet, Workbook workbook) {
         CreationHelper creationHelper = workbook.getCreationHelper();
         CellStyle dayMonthYearCellStyle = workbook.createCellStyle();
         
dayMonthYearCellStyle.setDataFormat(creationHelper.createDataFormat().getFormat("dd/mm/yyyy"));

(nifi) branch main updated: NIFI-14538 Added Input File Type with XLS support to SplitExcel Processor (#10540)

Reply via email to