This is an automated email from the ASF dual-hosted git repository.
exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new aba59357b0 NIFI-14702 Switched to Streaming Workbook in SplitExcel (#10058)
aba59357b0 is described below
commit aba59357b0118b77d4dc5f2d2591d55a741bdd48
Author: zhtk <[email protected]>
AuthorDate: Sat Jul 19 23:26:38 2025 +0200
NIFI-14702 Switched to Streaming Workbook in SplitExcel (#10058)
Co-authored-by: Kamilkime <[email protected]>
Signed-off-by: David Handermann <[email protected]>
---
.../apache/nifi/processors/excel/SplitExcel.java | 59 +++++++++++++++------
.../nifi/processors/excel/TestSplitExcel.java | 34 ++++++++++++
.../src/test/resources/excel/hyperlinks.xlsx | Bin 0 -> 10756 bytes
nifi-extension-bundles/nifi-poi-bundle/pom.xml | 2 +-
4 files changed, 77 insertions(+), 18 deletions(-)
diff --git a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
index d095edefd2..7270ec66cc 100644
--- a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
+++ b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/processors/excel/SplitExcel.java
@@ -18,6 +18,7 @@ package org.apache.nifi.processors.excel;
import com.github.pjfanning.xlsx.StreamingReader;
import com.github.pjfanning.xlsx.exceptions.ExcelRuntimeException;
+import com.github.pjfanning.xlsx.impl.XlsxHyperlink;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.SideEffectFree;
import org.apache.nifi.annotation.behavior.SupportsBatching;
@@ -35,12 +36,17 @@ import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellCopyContext;
import org.apache.poi.ss.usermodel.CellCopyPolicy;
+import org.apache.poi.ss.usermodel.Hyperlink;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.ss.util.CellRangeAddress;
+import org.apache.poi.ss.util.CellUtil;
+import org.apache.poi.xssf.streaming.SXSSFSheet;
+import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import java.io.OutputStream;
import java.util.ArrayList;
@@ -125,9 +131,9 @@ public class SplitExcel extends AbstractProcessor {
.cellStyle(CellCopyPolicy.DEFAULT_COPY_CELL_STYLE_POLICY)
.cellValue(CellCopyPolicy.DEFAULT_COPY_CELL_VALUE_POLICY)
.condenseRows(CellCopyPolicy.DEFAULT_CONDENSE_ROWS_POLICY)
- .copyHyperlink(CellCopyPolicy.DEFAULT_COPY_HYPERLINK_POLICY)
+ .copyHyperlink(false) // NOTE: the hyperlinks appear at end of sheet, so we need to iterate them separately at the end.
.mergeHyperlink(CellCopyPolicy.DEFAULT_MERGE_HYPERLINK_POLICY)
- .mergedRegions(CellCopyPolicy.DEFAULT_COPY_MERGED_REGIONS_POLICY)
+ .mergedRegions(false) // NOTE: set to false because of the explicit merge region handling in the copyRows method.
.rowHeight(CellCopyPolicy.DEFAULT_COPY_ROW_HEIGHT_POLICY)
.build();
@@ -150,7 +156,6 @@ public class SplitExcel extends AbstractProcessor {
final ProtectionType protectionType = context.getProperty(PROTECTION_TYPE).asAllowableValue(ProtectionType.class);
final String password = protectionType == ProtectionType.PASSWORD ? context.getProperty(PASSWORD).getValue() : null;
-
final List<WorkbookSplit> workbookSplits = new ArrayList<>();
try {
@@ -167,21 +172,15 @@ public class SplitExcel extends AbstractProcessor {
int index = 0;
for (final Sheet originalSheet : originalWorkbook) {
final String originalSheetName = originalSheet.getSheetName();
- try (XSSFWorkbook newWorkbook = new XSSFWorkbook()) {
- XSSFSheet newSheet = newWorkbook.createSheet(originalSheetName);
- List<Row> originalRows = new ArrayList<>();
- for (Row originalRow : originalSheet) {
- originalRows.add(originalRow);
- }
- if (!originalRows.isEmpty()) {
- newSheet.copyRows(originalRows, originalSheet.getFirstRowNum(), CELL_COPY_POLICY);
- }
+ try (final SXSSFWorkbook newWorkbook = new SXSSFWorkbook(null, SXSSFWorkbook.DEFAULT_WINDOW_SIZE, false, true)) {
+ final SXSSFSheet newSheet = newWorkbook.createSheet(originalSheetName);
+ final int numberOfCopiedRows = copyRows(originalSheet, newSheet);
- FlowFile newFlowFile = session.create(originalFlowFile);
+ final FlowFile newFlowFile = session.create(originalFlowFile);
try (final OutputStream out = session.write(newFlowFile)) {
newWorkbook.write(out);
- workbookSplits.add(new WorkbookSplit(index, newFlowFile, originalSheetName, originalRows.size()));
+ workbookSplits.add(new WorkbookSplit(index, newFlowFile, originalSheetName, numberOfCopiedRows));
}
}
@@ -230,6 +229,32 @@ public class SplitExcel extends AbstractProcessor {
session.transfer(flowFileSplits, REL_SPLIT);
}
- private record WorkbookSplit(int index, FlowFile content, String sheetName, int numRows) {
+ private int copyRows(final Sheet originalSheet, final SXSSFSheet destinationSheet) {
+ final CellCopyContext cellCopyContext = new CellCopyContext();
+ int rowCount = 0;
+
+ for (final Row sourceRow : originalSheet) {
+ final Row destinationRow = destinationSheet.createRow(sourceRow.getRowNum());
+ destinationRow.setHeight(sourceRow.getHeight());
+
+ for (final Cell sourceCell : sourceRow) {
+ final Cell destCell = destinationRow.createCell(sourceCell.getColumnIndex());
+ CellUtil.copyCell(sourceCell, destCell, CELL_COPY_POLICY, cellCopyContext);
+ }
+
+ rowCount++;
+ }
+
+ for (final CellRangeAddress sourceRegion : originalSheet.getMergedRegions()) {
+ destinationSheet.addMergedRegion(sourceRegion.copy());
+ }
+
+ for (final Hyperlink hyperlink : originalSheet.getHyperlinkList()) {
+ destinationSheet.addHyperlink(((XlsxHyperlink) hyperlink).createXSSFHyperlink());
+ }
+
+ return rowCount;
}
+
+ private record WorkbookSplit(int index, FlowFile content, String sheetName, int numRows) { }
}
diff --git a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
index fdfe5ff64f..b0540da260 100644
--- a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
+++ b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/processors/excel/TestSplitExcel.java
@@ -26,6 +26,7 @@ import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.AfterAll;
@@ -53,6 +54,7 @@ import static org.apache.nifi.flowfile.attributes.FragmentAttributes.FRAGMENT_ID
import static org.apache.nifi.flowfile.attributes.FragmentAttributes.FRAGMENT_INDEX;
import static org.apache.nifi.flowfile.attributes.FragmentAttributes.SEGMENT_ORIGINAL_FILENAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -239,6 +241,38 @@ public class TestSplitExcel {
}
}
+ @Test
+ void testHyperlinks() throws IOException {
+ final Path hyperlinksFile = Paths.get("src/test/resources/excel/hyperlinks.xlsx");
+ runner.enqueue(hyperlinksFile);
+
+ runner.run();
+
+ runner.assertTransferCount(SplitExcel.REL_SPLIT, 1);
+ runner.assertTransferCount(SplitExcel.REL_ORIGINAL, 1);
+ runner.assertTransferCount(SplitExcel.REL_FAILURE, 0);
+
+ final MockFlowFile flowFile = runner.getFlowFilesForRelationship(SplitExcel.REL_SPLIT).getFirst();
+ try (XSSFWorkbook workbook = new XSSFWorkbook(flowFile.getContentStream())) {
+ final Sheet sheet = workbook.getSheetAt(0);
+ assertEquals("Sheet1", sheet.getSheetName());
+
+ final List<XSSFHyperlink> hyperlinks = (List<XSSFHyperlink>) sheet.getHyperlinkList();
+ assertIterableEquals(
+ List.of(
+ "http://google.com/",
+ "https://apache.org/",
+ "https://en.wikipedia.org/",
+ "Sheet1",
+ "http://twitter.com/#!/apacheorg",
+ "http://www.bailii.org/databases.html#ie",
+ "https://en.wikipedia.org/wiki/Apache_POI#See_also"
+ ),
+ hyperlinks.stream().map(XSSFHyperlink::getAddress).toList()
+ );
+ }
+ }
+
private static void populateSheet(XSSFSheet sheet, Object[][] data) {
int rowCount = 0;
for (Object[] dataRow : data) {
diff --git a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/resources/excel/hyperlinks.xlsx b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/resources/excel/hyperlinks.xlsx
new file mode 100644
index 0000000000..bb161a0e72
Binary files /dev/null and b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/resources/excel/hyperlinks.xlsx differ
diff --git a/nifi-extension-bundles/nifi-poi-bundle/pom.xml b/nifi-extension-bundles/nifi-poi-bundle/pom.xml
index c5f6d52fa5..fd8cdf4373 100644
--- a/nifi-extension-bundles/nifi-poi-bundle/pom.xml
+++ b/nifi-extension-bundles/nifi-poi-bundle/pom.xml
@@ -45,7 +45,7 @@
<dependency>
<groupId>com.github.pjfanning</groupId>
<artifactId>excel-streaming-reader</artifactId>
- <version>5.1.0</version>
+ <version>5.1.1</version>
</dependency>
</dependencies>
</dependencyManagement>