This is an automated email from the ASF dual-hosted git repository.

exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git


The following commit(s) were added to refs/heads/main by this push:
     new aba59357b0 NIFI-14702 Switched to Streaming Workbook in SplitExcel (#10058)
aba59357b0 is described below

commit aba59357b0118b77d4dc5f2d2591d55a741bdd48
Author: zhtk <[email protected]>
AuthorDate: Sat Jul 19 23:26:38 2025 +0200

    NIFI-14702 Switched to Streaming Workbook in SplitExcel (#10058)
    
    Co-authored-by: Kamilkime <[email protected]>
    Signed-off-by: David Handermann <[email protected]>
---
 .../apache/nifi/processors/excel/SplitExcel.java   |  59 +++++++++++++++------
 .../nifi/processors/excel/TestSplitExcel.java      |  34 ++++++++++++
 .../src/test/resources/excel/hyperlinks.xlsx       | Bin 0 -> 10756 bytes
 nifi-extension-bundles/nifi-poi-bundle/pom.xml     |   2 +-
 4 files changed, 77 insertions(+), 18 deletions(-)

diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
index d095edefd2..7270ec66cc 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
@@ -18,6 +18,7 @@ package org.apache.nifi.processors.excel;
 
 import com.github.pjfanning.xlsx.StreamingReader;
 import com.github.pjfanning.xlsx.exceptions.ExcelRuntimeException;
+import com.github.pjfanning.xlsx.impl.XlsxHyperlink;
 import org.apache.nifi.annotation.behavior.InputRequirement;
 import org.apache.nifi.annotation.behavior.SideEffectFree;
 import org.apache.nifi.annotation.behavior.SupportsBatching;
@@ -35,12 +36,17 @@ import org.apache.nifi.processor.ProcessSession;
 import org.apache.nifi.processor.Relationship;
 import org.apache.nifi.processor.exception.ProcessException;
 import org.apache.nifi.processor.util.StandardValidators;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellCopyContext;
 import org.apache.poi.ss.usermodel.CellCopyPolicy;
+import org.apache.poi.ss.usermodel.Hyperlink;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.ss.util.CellRangeAddress;
+import org.apache.poi.ss.util.CellUtil;
+import org.apache.poi.xssf.streaming.SXSSFSheet;
+import org.apache.poi.xssf.streaming.SXSSFWorkbook;
 
 import java.io.OutputStream;
 import java.util.ArrayList;
@@ -125,9 +131,9 @@ public class SplitExcel extends AbstractProcessor {
             .cellStyle(CellCopyPolicy.DEFAULT_COPY_CELL_STYLE_POLICY)
             .cellValue(CellCopyPolicy.DEFAULT_COPY_CELL_VALUE_POLICY)
             .condenseRows(CellCopyPolicy.DEFAULT_CONDENSE_ROWS_POLICY)
-            .copyHyperlink(CellCopyPolicy.DEFAULT_COPY_HYPERLINK_POLICY)
+            .copyHyperlink(false) // NOTE: the hyperlinks appear at end of sheet, so we need to iterate them separately at the end.
             .mergeHyperlink(CellCopyPolicy.DEFAULT_MERGE_HYPERLINK_POLICY)
-            .mergedRegions(CellCopyPolicy.DEFAULT_COPY_MERGED_REGIONS_POLICY)
+            .mergedRegions(false) // NOTE: set to false because of the explicit merge region handling in the copyRows method.
             .rowHeight(CellCopyPolicy.DEFAULT_COPY_ROW_HEIGHT_POLICY)
             .build();
 
@@ -150,7 +156,6 @@ public class SplitExcel extends AbstractProcessor {
 
         final ProtectionType protectionType = 
context.getProperty(PROTECTION_TYPE).asAllowableValue(ProtectionType.class);
         final String password = protectionType == ProtectionType.PASSWORD ? 
context.getProperty(PASSWORD).getValue() : null;
-
         final List<WorkbookSplit> workbookSplits = new ArrayList<>();
 
         try {
@@ -167,21 +172,15 @@ public class SplitExcel extends AbstractProcessor {
                 int index = 0;
                 for (final Sheet originalSheet : originalWorkbook) {
                     final String originalSheetName = 
originalSheet.getSheetName();
-                    try (XSSFWorkbook newWorkbook = new XSSFWorkbook()) {
-                        XSSFSheet newSheet = 
newWorkbook.createSheet(originalSheetName);
-                        List<Row> originalRows = new ArrayList<>();
-                        for (Row originalRow : originalSheet) {
-                            originalRows.add(originalRow);
-                        }
 
-                        if (!originalRows.isEmpty()) {
-                            newSheet.copyRows(originalRows, 
originalSheet.getFirstRowNum(), CELL_COPY_POLICY);
-                        }
+                    try (final SXSSFWorkbook newWorkbook = new 
SXSSFWorkbook(null, SXSSFWorkbook.DEFAULT_WINDOW_SIZE, false, true)) {
+                        final SXSSFSheet newSheet = 
newWorkbook.createSheet(originalSheetName);
+                        final int numberOfCopiedRows = copyRows(originalSheet, 
newSheet);
 
-                        FlowFile newFlowFile = 
session.create(originalFlowFile);
+                        final FlowFile newFlowFile = 
session.create(originalFlowFile);
                         try (final OutputStream out = 
session.write(newFlowFile)) {
                             newWorkbook.write(out);
-                            workbookSplits.add(new WorkbookSplit(index, 
newFlowFile, originalSheetName, originalRows.size()));
+                            workbookSplits.add(new WorkbookSplit(index, 
newFlowFile, originalSheetName, numberOfCopiedRows));
                         }
                     }
 
@@ -230,6 +229,32 @@ public class SplitExcel extends AbstractProcessor {
         session.transfer(flowFileSplits, REL_SPLIT);
     }
 
-    private record WorkbookSplit(int index, FlowFile content, String 
sheetName, int numRows) {
+    private int copyRows(final Sheet originalSheet, final SXSSFSheet 
destinationSheet) {
+        final CellCopyContext cellCopyContext = new CellCopyContext();
+        int rowCount = 0;
+
+        for (final Row sourceRow : originalSheet) {
+            final Row destinationRow = 
destinationSheet.createRow(sourceRow.getRowNum());
+            destinationRow.setHeight(sourceRow.getHeight());
+
+            for (final Cell sourceCell : sourceRow) {
+                final Cell destCell = 
destinationRow.createCell(sourceCell.getColumnIndex());
+                CellUtil.copyCell(sourceCell, destCell, CELL_COPY_POLICY, 
cellCopyContext);
+            }
+
+            rowCount++;
+        }
+
+        for (final CellRangeAddress sourceRegion : 
originalSheet.getMergedRegions()) {
+            destinationSheet.addMergedRegion(sourceRegion.copy());
+        }
+
+        for (final Hyperlink hyperlink : originalSheet.getHyperlinkList()) {
+            destinationSheet.addHyperlink(((XlsxHyperlink) 
hyperlink).createXSSFHyperlink());
+        }
+
+        return rowCount;
     }
+
+    private record WorkbookSplit(int index, FlowFile content, String 
sheetName, int numRows) { }
 }
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
index fdfe5ff64f..b0540da260 100644
--- 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
+++ 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
@@ -26,6 +26,7 @@ import org.apache.poi.ss.usermodel.CreationHelper;
 import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.xssf.usermodel.XSSFHyperlink;
 import org.apache.poi.xssf.usermodel.XSSFSheet;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.junit.jupiter.api.AfterAll;
@@ -53,6 +54,7 @@ import static 
org.apache.nifi.flowfile.attributes.FragmentAttributes.FRAGMENT_ID
 import static 
org.apache.nifi.flowfile.attributes.FragmentAttributes.FRAGMENT_INDEX;
 import static 
org.apache.nifi.flowfile.attributes.FragmentAttributes.SEGMENT_ORIGINAL_FILENAME;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertIterableEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -239,6 +241,38 @@ public class TestSplitExcel {
         }
     }
 
+    @Test
+    void testHyperlinks() throws IOException {
+        final Path hyperlinksFile = 
Paths.get("src/test/resources/excel/hyperlinks.xlsx");
+        runner.enqueue(hyperlinksFile);
+
+        runner.run();
+
+        runner.assertTransferCount(SplitExcel.REL_SPLIT, 1);
+        runner.assertTransferCount(SplitExcel.REL_ORIGINAL, 1);
+        runner.assertTransferCount(SplitExcel.REL_FAILURE, 0);
+
+        final MockFlowFile flowFile = 
runner.getFlowFilesForRelationship(SplitExcel.REL_SPLIT).getFirst();
+        try (XSSFWorkbook workbook = new 
XSSFWorkbook(flowFile.getContentStream())) {
+            final Sheet sheet = workbook.getSheetAt(0);
+            assertEquals("Sheet1", sheet.getSheetName());
+
+            final List<XSSFHyperlink> hyperlinks = (List<XSSFHyperlink>) 
sheet.getHyperlinkList();
+            assertIterableEquals(
+                    List.of(
+                            "http://google.com/",
+                            "https://apache.org/",
+                            "https://en.wikipedia.org/",
+                            "Sheet1",
+                            "http://twitter.com/#!/apacheorg",
+                            "http://www.bailii.org/databases.html#ie",
+                            "https://en.wikipedia.org/wiki/Apache_POI#See_also"
+                    ),
+                    hyperlinks.stream().map(XSSFHyperlink::getAddress).toList()
+            );
+        }
+    }
+
     private static void populateSheet(XSSFSheet sheet, Object[][] data) {
         int rowCount = 0;
         for (Object[] dataRow : data) {
diff --git 
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/resources/excel/hyperlinks.xlsx
 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/resources/excel/hyperlinks.xlsx
new file mode 100644
index 0000000000..bb161a0e72
Binary files /dev/null and 
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/resources/excel/hyperlinks.xlsx
 differ
diff --git a/nifi-extension-bundles/nifi-poi-bundle/pom.xml 
b/nifi-extension-bundles/nifi-poi-bundle/pom.xml
index c5f6d52fa5..fd8cdf4373 100644
--- a/nifi-extension-bundles/nifi-poi-bundle/pom.xml
+++ b/nifi-extension-bundles/nifi-poi-bundle/pom.xml
@@ -45,7 +45,7 @@
             <dependency>
                 <groupId>com.github.pjfanning</groupId>
                 <artifactId>excel-streaming-reader</artifactId>
-                <version>5.1.0</version>
+                <version>5.1.1</version>
             </dependency>
         </dependencies>
     </dependencyManagement>

Reply via email to