This is an automated email from the ASF dual-hosted git repository.
exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new 27522cf9e7 NIFI-14426 Added support for Binary XLS format to
ExcelReader (#9874)
27522cf9e7 is described below
commit 27522cf9e74659d1a752a9b9722911f1c8c801dd
Author: zhtk <[email protected]>
AuthorDate: Sat May 10 18:40:53 2025 +0200
NIFI-14426 Added support for Binary XLS format to ExcelReader (#9874)
Signed-off-by: David Handermann <[email protected]>
Co-authored-by: dan-s1 <[email protected]>
---
.../nifi/excel/ExcelHeaderSchemaStrategy.java | 2 +
.../java/org/apache/nifi/excel/ExcelReader.java | 17 ++-
.../org/apache/nifi/excel/ExcelRecordReader.java | 2 +-
.../nifi/excel/ExcelRecordReaderConfiguration.java | 12 ++
.../org/apache/nifi/excel/ExcelRecordSource.java | 2 +
.../java/org/apache/nifi/excel/InputFileType.java | 45 +++++++
.../java/org/apache/nifi/excel/RowIterator.java | 29 ++++-
.../apache/nifi/excel/TestExcelRecordReader.java | 142 ++++++++++++++-------
8 files changed, 191 insertions(+), 60 deletions(-)
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
index 5db05490d2..fc8073adf7 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelHeaderSchemaStrategy.java
@@ -74,10 +74,12 @@ public class ExcelHeaderSchemaStrategy implements
SchemaAccessStrategy {
final int firstRow = rawFirstRow == null ?
NumberUtils.toInt(ExcelReader.STARTING_ROW.getDefaultValue()) : rawFirstRow;
final int zeroBasedFirstRow = ExcelReader.getZeroBasedIndex(firstRow);
final String password =
context.getProperty(ExcelReader.PASSWORD).getValue();
+ final InputFileType inputFileType =
context.getProperty(ExcelReader.INPUT_FILE_TYPE).asAllowableValue(InputFileType.class);
final ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withRequiredSheets(requiredSheets)
.withFirstRow(zeroBasedFirstRow)
.withPassword(password)
+ .withInputFileType(inputFileType)
.build();
final RowIterator rowIterator = new RowIterator(contentStream,
configuration, logger);
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
index 428721e745..d36c7f7923 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelReader.java
@@ -53,13 +53,13 @@ import java.util.Collections;
import java.util.List;
import java.util.Map;
-@Tags({"excel", "spreadsheet", "xlsx", "parse", "record", "row", "reader",
"values", "cell"})
+@Tags({"excel", "spreadsheet", "xls", "xlsx", "parse", "record", "row",
"reader", "values", "cell"})
@CapabilityDescription("Parses a Microsoft Excel document returning each row
in each sheet as a separate record. "
+ "This reader allows for inferring a schema from all the required
sheets "
+ "or providing an explicit schema for interpreting the values."
+ "See Controller Service's Usage for further documentation. "
- + "This reader is currently only capable of processing .xlsx "
- + "(XSSF 2007 OOXML file format) Excel documents and not older .xls
(HSSF '97(-2007) file format) documents.")
+ + "This reader is capable of processing both password and non password
protected .xlsx (XSSF 2007 OOXML file format) "
+ + "and older .xls (HSSF '97(-2007) file format) Excel documents.")
public class ExcelReader extends SchemaRegistryService implements
RecordReaderFactory {
public static final PropertyDescriptor REQUIRED_SHEETS = new
PropertyDescriptor
@@ -85,6 +85,15 @@ public class ExcelReader extends SchemaRegistryService
implements RecordReaderFa
.addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
.build();
+ public static final PropertyDescriptor INPUT_FILE_TYPE = new
PropertyDescriptor
+ .Builder().name("Input File Type")
+ .displayName("Input File Type")
+ .description("Specifies type of Excel input file.")
+ .required(true)
+ .allowableValues(InputFileType.class)
+ .defaultValue(InputFileType.XLSX)
+ .build();
+
public static final PropertyDescriptor PROTECTION_TYPE = new
PropertyDescriptor
.Builder().name("Protection Type")
.displayName("Protection Type")
@@ -128,6 +137,7 @@ public class ExcelReader extends SchemaRegistryService
implements RecordReaderFa
final List<String> requiredSheets = getRequiredSheets(variables);
final int firstRow = getStartingRow(variables);
final String password =
configurationContext.getProperty(PASSWORD).getValue();
+ final InputFileType inputFileType =
configurationContext.getProperty(INPUT_FILE_TYPE).asAllowableValue(InputFileType.class);
final ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withDateFormat(dateFormat)
.withRequiredSheets(requiredSheets)
@@ -136,6 +146,7 @@ public class ExcelReader extends SchemaRegistryService
implements RecordReaderFa
.withTimeFormat(timeFormat)
.withTimestampFormat(timestampFormat)
.withPassword(password)
+ .withInputFileType(inputFileType)
.build();
return new ExcelRecordReader(configuration, in, logger);
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordReader.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordReader.java
index ea45a2adca..d055a1f251 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordReader.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordReader.java
@@ -71,7 +71,7 @@ public class ExcelRecordReader implements RecordReader {
try {
this.rowIterator = new RowIterator(inputStream, configuration,
logger);
} catch (RuntimeException e) {
- throw new MalformedRecordException("Read initial Record from Excel
XLSX failed", e);
+ throw new MalformedRecordException("Read initial Record from Excel
file failed", e);
}
}
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordReaderConfiguration.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordReaderConfiguration.java
index 6faa826ad5..f3fd777ec1 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordReaderConfiguration.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordReaderConfiguration.java
@@ -30,6 +30,7 @@ public class ExcelRecordReaderConfiguration {
private String timestampFormat;
private String password;
private boolean avoidTempFiles;
+ private InputFileType inputFileType;
private ExcelRecordReaderConfiguration() {
}
@@ -66,6 +67,10 @@ public class ExcelRecordReaderConfiguration {
return avoidTempFiles;
}
+ public InputFileType getInputFileType() {
+ return inputFileType;
+ }
+
public static final class Builder {
private RecordSchema schema;
private List<String> requiredSheets;
@@ -75,6 +80,7 @@ public class ExcelRecordReaderConfiguration {
private String timestampFormat;
private String password;
private boolean avoidTempFiles;
+ private InputFileType inputFileType;
public Builder withSchema(RecordSchema schema) {
this.schema = schema;
@@ -116,6 +122,11 @@ public class ExcelRecordReaderConfiguration {
return this;
}
+ public Builder withInputFileType(InputFileType inputFileType) {
+ this.inputFileType = inputFileType;
+ return this;
+ }
+
public ExcelRecordReaderConfiguration build() {
ExcelRecordReaderConfiguration excelRecordReaderConfiguration =
new ExcelRecordReaderConfiguration();
excelRecordReaderConfiguration.schema = this.schema;
@@ -126,6 +137,7 @@ public class ExcelRecordReaderConfiguration {
excelRecordReaderConfiguration.firstRow = this.firstRow;
excelRecordReaderConfiguration.password = password;
excelRecordReaderConfiguration.avoidTempFiles = avoidTempFiles;
+ excelRecordReaderConfiguration.inputFileType = inputFileType;
return excelRecordReaderConfiguration;
}
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordSource.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordSource.java
index 24f4a9fcff..0c98f6e0a0 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordSource.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/ExcelRecordSource.java
@@ -36,10 +36,12 @@ public class ExcelRecordSource implements RecordSource<Row>
{
final int firstRow = rawFirstRow == null ?
NumberUtils.toInt(ExcelReader.STARTING_ROW.getDefaultValue()) : rawFirstRow;
final int zeroBasedFirstRow = ExcelReader.getZeroBasedIndex(firstRow);
final String password =
context.getProperty(ExcelReader.PASSWORD).getValue();
+ final InputFileType inputFileType =
context.getProperty(ExcelReader.INPUT_FILE_TYPE).asAllowableValue(InputFileType.class);
final ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withRequiredSheets(requiredSheets)
.withFirstRow(zeroBasedFirstRow)
.withPassword(password)
+ .withInputFileType(inputFileType)
.build();
this.rowIterator = new RowIterator(in, configuration, logger);
}
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/InputFileType.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/InputFileType.java
new file mode 100644
index 0000000000..9f5d4df000
--- /dev/null
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/InputFileType.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nifi.excel;
+
+import org.apache.nifi.components.DescribedValue;
+
+public enum InputFileType implements DescribedValue {
+ XLS("XLS 1997-2007 file format"),
+ XLSX("XLSX 2007-present OOXML file format");
+
+ InputFileType(String description) {
+ this.description = description;
+ }
+
+ private final String description;
+
+ @Override
+ public String getValue() {
+ return name();
+ }
+
+ @Override
+ public String getDisplayName() {
+ return name();
+ }
+
+ @Override
+ public String getDescription() {
+ return description;
+ }
+}
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowIterator.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowIterator.java
index 6d067a7c07..d4172be8ad 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowIterator.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/main/java/org/apache/nifi/excel/RowIterator.java
@@ -19,6 +19,8 @@ package org.apache.nifi.excel;
import com.github.pjfanning.xlsx.StreamingReader;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.exception.ProcessException;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
@@ -42,13 +44,26 @@ class RowIterator implements Iterator<Row>, Closeable {
private Row currentRow;
RowIterator(final InputStream in, final ExcelRecordReaderConfiguration
configuration, final ComponentLog logger) {
- this.workbook = StreamingReader.builder()
- .rowCacheSize(100)
- .bufferSize(4096)
- .password(configuration.getPassword())
- .setAvoidTempFiles(configuration.isAvoidTempFiles())
- .setReadSharedFormulas(true) // NOTE: If not set to true, then
data with shared formulas fail.
- .open(in);
+ if (configuration.getInputFileType() == InputFileType.XLSX) {
+ this.workbook = StreamingReader.builder()
+ .rowCacheSize(100)
+ .bufferSize(4096)
+ .password(configuration.getPassword())
+ .setAvoidTempFiles(configuration.isAvoidTempFiles())
+ .setReadSharedFormulas(true) // NOTE: If not set to true,
then data with shared formulas fail.
+ .open(in);
+ } else {
+ // Providing the password to the HSSFWorkbook is done by setting a
thread variable managed by
+ // Biff8EncryptionKey. After the workbook is created, the thread
variable can be cleared.
+
Biff8EncryptionKey.setCurrentUserPassword(configuration.getPassword());
+ try {
+ this.workbook = new HSSFWorkbook(in);
+ } catch (final IOException e) {
+ throw new ProcessException("Failed to open XLS file", e);
+ } finally {
+ Biff8EncryptionKey.setCurrentUserPassword(null);
+ }
+ }
final List<String> requiredSheets = configuration.getRequiredSheets();
if (requiredSheets == null || requiredSheets.isEmpty()) {
diff --git
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelRecordReader.java
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelRecordReader.java
index 14f7015a15..4b33c2e238 100644
---
a/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelRecordReader.java
+++
b/nifi-extension-bundles/nifi-poi-bundle/nifi-poi-services/src/test/java/org/apache/nifi/excel/TestExcelRecordReader.java
@@ -16,7 +16,6 @@
*/
package org.apache.nifi.excel;
-import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.serialization.MalformedRecordException;
@@ -25,7 +24,8 @@ import org.apache.nifi.serialization.record.Record;
import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordFieldType;
import org.apache.nifi.serialization.record.RecordSchema;
-import com.github.pjfanning.xlsx.exceptions.ReadException;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.crypt.EncryptionMode;
@@ -33,13 +33,15 @@ import org.apache.poi.poifs.crypt.Encryptor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
import org.junit.jupiter.params.provider.ValueSource;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
@@ -53,9 +55,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Timestamp;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collections;
import java.util.List;
+import java.util.function.Supplier;
import static java.nio.file.Files.newDirectoryStream;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
@@ -71,6 +73,7 @@ public class TestExcelRecordReader {
private static final String MULTI_SHEET_FILE = "twoSheets.xlsx";
private static final String PASSWORD = "nifi";
private static final ByteArrayOutputStream PASSWORD_PROTECTED = new
ByteArrayOutputStream();
+ private static final ByteArrayOutputStream PASSWORD_PROTECTED_OLDER_EXCEL
= new ByteArrayOutputStream();
private static final Object[][] DATA = {
{"ID", "Name"},
{1, "Manny"},
@@ -98,21 +101,8 @@ public class TestExcelRecordReader {
@BeforeAll
static void setUpBeforeAll() throws Exception {
- //Generate an Excel file and populate it with data
- final InputStream workbook = createWorkbook(DATA);
-
- //Protect the Excel file with a password
- try (POIFSFileSystem poifsFileSystem = new POIFSFileSystem()) {
- EncryptionInfo encryptionInfo = new
EncryptionInfo(EncryptionMode.agile);
- Encryptor encryptor = encryptionInfo.getEncryptor();
- encryptor.confirmPassword(PASSWORD);
-
- try (OPCPackage opc = OPCPackage.open(workbook);
- OutputStream os = encryptor.getDataStream(poifsFileSystem)) {
- opc.save(os);
- }
- poifsFileSystem.writeFilesystem(PASSWORD_PROTECTED);
- }
+ createPasswordProtectedWorkbook();
+ createPasswordProtectedOlderExcelWorkbook();
}
@Test
@@ -122,20 +112,32 @@ public class TestExcelRecordReader {
MalformedRecordException mre =
assertThrows(MalformedRecordException.class, () -> new
ExcelRecordReader(configuration, getInputStream("notExcel.txt"), logger));
final Throwable cause = mre.getCause();
- assertInstanceOf(ReadException.class, cause);
+ assertInstanceOf(ProcessException.class, cause);
}
@Test
- public void testOlderExcelFormatFile() {
- ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder().build();
- MalformedRecordException mre =
assertThrows(MalformedRecordException.class, () -> new
ExcelRecordReader(configuration, getInputStream("olderFormat.xls"), logger));
- assertTrue(ExceptionUtils.getStackTrace(mre).contains("data appears to
be in the OLE2 Format"));
+ public void testOlderExcelFormatFile() throws MalformedRecordException {
+ final List<RecordField> fields = List.of(
+ new RecordField("A", RecordFieldType.STRING.getDataType()),
+ new RecordField("B", RecordFieldType.STRING.getDataType()),
+ new RecordField("C", RecordFieldType.STRING.getDataType()));
+
+ ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
+ .withSchema(new SimpleRecordSchema(fields))
+ .withInputFileType(InputFileType.XLS)
+ .build();
+
+ final ExcelRecordReader recordReader = new
ExcelRecordReader(configuration, getInputStream("olderFormat.xls"), logger);
+ final List<Record> records = getRecords(recordReader, false, false);
+
+ assertEquals(4, records.size());
}
@Test
public void testMultipleRecordsSingleSheet() throws
MalformedRecordException {
ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withSchema(getDataFormattingSchema())
+ .withInputFileType(InputFileType.XLSX)
.build();
ExcelRecordReader recordReader = new ExcelRecordReader(configuration,
getInputStream(DATA_FORMATTING_FILE), logger);
@@ -145,7 +147,7 @@ public class TestExcelRecordReader {
}
private RecordSchema getDataFormattingSchema() {
- final List<RecordField> fields = Arrays.asList(
+ final List<RecordField> fields = List.of(
new RecordField("Numbers",
RecordFieldType.DOUBLE.getDataType()),
new RecordField("Timestamps",
RecordFieldType.DATE.getDataType()),
new RecordField("Money", RecordFieldType.DOUBLE.getDataType()),
@@ -176,12 +178,13 @@ public class TestExcelRecordReader {
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testDropUnknownFields(boolean dropUnknownFields) throws
MalformedRecordException {
- final List<RecordField> fields = Arrays.asList(
+ final List<RecordField> fields = List.of(
new RecordField("Numbers",
RecordFieldType.DOUBLE.getDataType()),
new RecordField("Timestamps",
RecordFieldType.DATE.getDataType()));
ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withSchema(new SimpleRecordSchema(fields))
+ .withInputFileType(InputFileType.XLSX)
.build();
ExcelRecordReader recordReader = new ExcelRecordReader(configuration,
getInputStream(DATA_FORMATTING_FILE), logger);
@@ -203,6 +206,7 @@ public class TestExcelRecordReader {
ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withFirstRow(5)
.withSchema(getDataFormattingSchema())
+ .withInputFileType(InputFileType.XLSX)
.build();
ExcelRecordReader recordReader = new ExcelRecordReader(configuration,
getInputStream(DATA_FORMATTING_FILE), logger);
@@ -221,6 +225,7 @@ public class TestExcelRecordReader {
.withTimeFormat(RecordFieldType.TIME.getDefaultFormat())
.withTimestampFormat(RecordFieldType.TIMESTAMP.getDefaultFormat())
.withSchema(schema)
+ .withInputFileType(InputFileType.XLSX)
.build();
ExcelRecordReader recordReader = new ExcelRecordReader(configuration,
getInputStream("dates.xlsx"), logger);
@@ -238,6 +243,7 @@ public class TestExcelRecordReader {
.withSchema(schema)
.withFirstRow(1)
.withRequiredSheets(requiredSheets)
+ .withInputFileType(InputFileType.XLSX)
.build();
ExcelRecordReader recordReader = new ExcelRecordReader(configuration,
getInputStream(MULTI_SHEET_FILE), logger);
@@ -247,7 +253,7 @@ public class TestExcelRecordReader {
}
private RecordSchema getSpecificSheetSchema() {
- return new SimpleRecordSchema(Arrays.asList(new RecordField("first",
RecordFieldType.STRING.getDataType()),
+ return new SimpleRecordSchema(List.of(new RecordField("first",
RecordFieldType.STRING.getDataType()),
new RecordField("second",
RecordFieldType.STRING.getDataType()),
new RecordField("third",
RecordFieldType.STRING.getDataType())));
}
@@ -260,6 +266,7 @@ public class TestExcelRecordReader {
.withSchema(schema)
.withFirstRow(1)
.withRequiredSheets(requiredSheets)
+ .withInputFileType(InputFileType.XLSX)
.build();
MalformedRecordException mre =
assertThrows(MalformedRecordException.class,
@@ -270,10 +277,11 @@ public class TestExcelRecordReader {
@Test
public void testSelectAllSheets() throws MalformedRecordException {
- RecordSchema schema = new SimpleRecordSchema(Arrays.asList(new
RecordField("first", RecordFieldType.STRING.getDataType()),
+ RecordSchema schema = new SimpleRecordSchema(List.of(new
RecordField("first", RecordFieldType.STRING.getDataType()),
new RecordField("second",
RecordFieldType.STRING.getDataType())));
ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withSchema(schema)
+ .withInputFileType(InputFileType.XLSX)
.build();
ExcelRecordReader recordReader = new ExcelRecordReader(configuration,
getInputStream(MULTI_SHEET_FILE), logger);
@@ -284,13 +292,14 @@ public class TestExcelRecordReader {
@Test
void testWhereCellValueDoesNotMatchSchemaType() {
- RecordSchema schema = new SimpleRecordSchema(Arrays.asList(new
RecordField("first", RecordFieldType.STRING.getDataType()),
+ RecordSchema schema = new SimpleRecordSchema(List.of(new
RecordField("first", RecordFieldType.STRING.getDataType()),
new RecordField("second",
RecordFieldType.FLOAT.getDataType())));
List<String> requiredSheets = Collections.singletonList("TestSheetA");
ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withSchema(schema)
.withFirstRow(2)
.withRequiredSheets(requiredSheets)
+ .withInputFileType(InputFileType.XLSX)
.build();
final MalformedRecordException mre =
assertThrows(MalformedRecordException.class, () -> {
@@ -302,48 +311,55 @@ public class TestExcelRecordReader {
assertTrue(mre.getMessage().contains("on row") &&
mre.getMessage().contains("in sheet"));
}
- @Test
- void testPasswordProtected() throws Exception {
- RecordSchema schema = getPasswordProtectedSchema();
- ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
+ @ParameterizedTest
+ @EnumSource(InputFileType.class)
+ void testPasswordProtected(InputFileType inputFileType) throws Exception {
+ final RecordSchema schema = getPasswordProtectedSchema();
+ final ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withSchema(schema)
.withPassword(PASSWORD)
.withAvoidTempFiles(true)
+ .withInputFileType(inputFileType)
.build();
- InputStream inputStream = new
ByteArrayInputStream(PASSWORD_PROTECTED.toByteArray());
- ExcelRecordReader recordReader = new ExcelRecordReader(configuration,
inputStream, logger);
- List<Record> records = getRecords(recordReader, false, false);
+ final ByteArrayOutputStream fileStream = inputFileType ==
InputFileType.XLSX ? PASSWORD_PROTECTED : PASSWORD_PROTECTED_OLDER_EXCEL;
+ final InputStream inputStream = new
ByteArrayInputStream(fileStream.toByteArray());
+ final ExcelRecordReader recordReader = new
ExcelRecordReader(configuration, inputStream, logger);
+ final List<Record> records = getRecords(recordReader, false, false);
assertEquals(DATA.length, records.size());
}
- @Test
- void testPasswordProtectedWithoutPassword() {
- RecordSchema schema = getPasswordProtectedSchema();
- ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
+ @ParameterizedTest
+ @EnumSource(InputFileType.class)
+ void testPasswordProtectedWithoutPassword(InputFileType inputFileType) {
+ final RecordSchema schema = getPasswordProtectedSchema();
+ final ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withSchema(schema)
+ .withInputFileType(inputFileType)
.build();
- InputStream inputStream = new
ByteArrayInputStream(PASSWORD_PROTECTED.toByteArray());
+ final ByteArrayOutputStream fileStream = inputFileType ==
InputFileType.XLSX ? PASSWORD_PROTECTED : PASSWORD_PROTECTED_OLDER_EXCEL;
+ final InputStream inputStream = new
ByteArrayInputStream(fileStream.toByteArray());
assertThrows(Exception.class, () -> new
ExcelRecordReader(configuration, inputStream, logger));
}
private RecordSchema getPasswordProtectedSchema() {
- return new SimpleRecordSchema(Arrays.asList(new RecordField("id",
RecordFieldType.INT.getDataType()),
+ return new SimpleRecordSchema(List.of(new RecordField("id",
RecordFieldType.INT.getDataType()),
new RecordField("name",
RecordFieldType.STRING.getDataType())));
}
@Test
void testWithNumberColumnWhoseValueIsEmptyString() throws Exception {
- final RecordSchema schema = new SimpleRecordSchema(Arrays.asList(new
RecordField("first", RecordFieldType.STRING.getDataType()),
+ final RecordSchema schema = new SimpleRecordSchema(List.of(new
RecordField("first", RecordFieldType.STRING.getDataType()),
new RecordField("second",
RecordFieldType.LONG.getDataType())));
final ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withSchema(schema)
+ .withInputFileType(InputFileType.XLSX)
.build();
final Object[][] data = {{"Manny", ""}};
- final InputStream workbook = createWorkbook(data);
+ final InputStream workbook = createWorkbook(data, XSSFWorkbook::new);
final ExcelRecordReader recordReader = new
ExcelRecordReader(configuration, workbook, logger);
assertDoesNotThrow(() -> getRecords(recordReader, true, true));
@@ -355,9 +371,10 @@ public class TestExcelRecordReader {
final RecordSchema schema = new SimpleRecordSchema(List.of(new
RecordField(fieldName, RecordFieldType.STRING.getDataType())));
final ExcelRecordReaderConfiguration configuration = new
ExcelRecordReaderConfiguration.Builder()
.withSchema(schema)
+ .withInputFileType(InputFileType.XLSX)
.build();
final Object[][] data = {{9876543210L}};
- final InputStream workbook = createWorkbook(data);
+ final InputStream workbook = createWorkbook(data, XSSFWorkbook::new);
final ExcelRecordReader recordReader = new
ExcelRecordReader(configuration, workbook, logger);
final List<Record> records = getRecords(recordReader, true, true);
final Record firstRecord = records.getFirst();
@@ -366,10 +383,10 @@ public class TestExcelRecordReader {
assertEquals(scientificNotationNumber,
firstRecord.getAsString(fieldName));
}
- private static InputStream createWorkbook(Object[][] data) throws
Exception {
+ private static InputStream createWorkbook(Object[][] data,
Supplier<Workbook> workbookSupplier) throws Exception {
final ByteArrayOutputStream workbookOutputStream = new
ByteArrayOutputStream();
- try (XSSFWorkbook workbook = new XSSFWorkbook()) {
- final XSSFSheet sheet = workbook.createSheet("SomeSheetName");
+ try (Workbook workbook = workbookSupplier.get()) {
+ final Sheet sheet = workbook.createSheet("SomeSheetName");
populateSheet(sheet, data);
workbook.write(workbookOutputStream);
}
@@ -377,7 +394,34 @@ public class TestExcelRecordReader {
return new ByteArrayInputStream(workbookOutputStream.toByteArray());
}
- private static void populateSheet(XSSFSheet sheet, Object[][] data) {
+ private static void createPasswordProtectedWorkbook() throws Exception {
+ // Generate an Excel file and populate it with data
+ final InputStream workbook = createWorkbook(DATA, XSSFWorkbook::new);
+
+ // Protect the Excel file with a password
+ try (POIFSFileSystem poifsFileSystem = new POIFSFileSystem()) {
+ EncryptionInfo encryptionInfo = new
EncryptionInfo(EncryptionMode.agile);
+ Encryptor encryptor = encryptionInfo.getEncryptor();
+ encryptor.confirmPassword(PASSWORD);
+
+ try (OPCPackage opc = OPCPackage.open(workbook);
+ OutputStream os = encryptor.getDataStream(poifsFileSystem)) {
+ opc.save(os);
+ }
+ poifsFileSystem.writeFilesystem(PASSWORD_PROTECTED);
+ }
+ }
+
+ private static void createPasswordProtectedOlderExcelWorkbook() throws
Exception {
+ Biff8EncryptionKey.setCurrentUserPassword(PASSWORD);
+ try (final InputStream inputStream = createWorkbook(DATA,
HSSFWorkbook::new)) {
+ inputStream.transferTo(PASSWORD_PROTECTED_OLDER_EXCEL);
+ } finally {
+ Biff8EncryptionKey.setCurrentUserPassword(null);
+ }
+ }
+
+ private static void populateSheet(Sheet sheet, Object[][] data) {
//Adding the data to the Excel worksheet
int rowCount = 0;
for (Object[] dataRow : data) {