This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new d84386d DRILL-7912: Add Sheet Names to Excel Reader (#2211)
d84386d is described below
commit d84386ddafb5b9c5b029e4b80c0e12d2e70eb76a
Author: Charles S. Givre <[email protected]>
AuthorDate: Sun May 2 15:06:04 2021 -0400
DRILL-7912: Add Sheet Names to Excel Reader (#2211)
* Initial Commit
* Updated streaming reader to latest version
* Addressed Review Comments
---
contrib/format-excel/README.md | 1 +
contrib/format-excel/pom.xml | 2 +-
.../drill/exec/store/excel/ExcelBatchReader.java | 76 ++++++++++++++++++----
.../drill/exec/store/excel/TestExcelFormat.java | 17 +++++
4 files changed, 84 insertions(+), 12 deletions(-)
diff --git a/contrib/format-excel/README.md b/contrib/format-excel/README.md
index 1a59428..17bb73b 100644
--- a/contrib/format-excel/README.md
+++ b/contrib/format-excel/README.md
@@ -67,6 +67,7 @@ The fields are:
_created
_last_printed
_modified
+ _sheets
### Known Limitations:
diff --git a/contrib/format-excel/pom.xml b/contrib/format-excel/pom.xml
index 020b897..549cf65 100644
--- a/contrib/format-excel/pom.xml
+++ b/contrib/format-excel/pom.xml
@@ -67,7 +67,7 @@
<dependency>
<groupId>com.github.pjfanning</groupId>
<artifactId>excel-streaming-reader</artifactId>
- <version>3.0.3</version>
+ <version>3.0.4</version>
</dependency>
</dependencies>
<build>
diff --git
a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index d30f859..73d634c 100644
---
a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++
b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -23,6 +23,7 @@ import com.github.pjfanning.xlsx.impl.StreamingWorkbook;
import org.apache.drill.common.exceptions.CustomErrorContext;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.TypeProtos.DataMode;
import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
import
org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
@@ -116,6 +117,23 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
}
}
+ private enum IMPLICIT_LIST_COLUMN {
+ /**
+ * A list of the available sheets in the file.
+ */
+ SHEETS("_sheets");
+
+ private final String fieldName;
+
+ IMPLICIT_LIST_COLUMN(String fieldName) {
+ this.fieldName = fieldName;
+ }
+
+ public String getFieldName() {
+ return fieldName;
+ }
+ }
+
private static final int ROW_CACHE_SIZE = 100;
private static final int BUFFER_SIZE = 4096;
@@ -129,6 +147,7 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
private List<ScalarWriter> columnWriters;
private List<CellWriter> cellWriterArray;
private List<ScalarWriter> metadataColumnWriters;
+ private ScalarWriter sheetNameWriter;
private Iterator<Row> rowIterator;
private RowSetLoader rowWriter;
private int totalColumnCount;
@@ -137,6 +156,7 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
private int recordCount;
private Map<String, String> stringMetadata;
private Map<String, Date> dateMetadata;
+ private Map<String, List<String>> listMetadata;
private CustomErrorContext errorContext;
@@ -217,9 +237,6 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
}
private void getColumnHeaders(SchemaBuilder builder) {
- //Get the field names
- int columnCount;
-
// Case for empty sheet
if (sheet.getLastRowNum() == 0) {
builder.buildSchema();
@@ -233,7 +250,7 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
// Get the number of columns.
// This method also advances the row reader to the location of the first
row of data
- columnCount = getColumnCount();
+ setFirstRow();
excelFieldNames = new ArrayList<>();
cellWriterArray = new ArrayList<>();
@@ -287,7 +304,7 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
excelFieldNames.add(colPosition, tempColumnName);
break;
case FORMULA:
- case NUMERIC:
+ case NUMERIC:
case _NONE:
case BLANK:
tempColumnName = cell.getStringCellValue();
@@ -325,11 +342,10 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
}
/**
- * Returns the column count. There are a few gotchas here in that we have
to know the header row and count the physical number of cells
+ * There are a few gotchas here in that we have to know the header row and
count the physical number of cells
* in that row. This function also has to move the rowIterator object to
the first row of data.
- * @return The number of actual columns
*/
- private int getColumnCount() {
+ private void setFirstRow() {
// Initialize
currentRow = rowIterator.next();
int rowNumber = readerConfig.headerRow > 0 ? sheet.getFirstRowNum() : 0;
@@ -339,8 +355,6 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
for (int i = 1; i < rowNumber; i++) {
currentRow = rowIterator.next();
}
-
- return currentRow.getPhysicalNumberOfCells();
}
@Override
@@ -417,6 +431,7 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
stringMetadata = new HashMap<>();
dateMetadata = new HashMap<>();
+ listMetadata = new HashMap<>();
// Populate String metadata columns
stringMetadata.put(IMPLICIT_STRING_COLUMN.CATEGORY.getFieldName(),
fileMetadata.getCategory());
@@ -435,6 +450,9 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
dateMetadata.put(IMPLICIT_TIMESTAMP_COLUMN.CREATED.getFieldName(),
fileMetadata.getCreated());
dateMetadata.put(IMPLICIT_TIMESTAMP_COLUMN.LAST_PRINTED.getFieldName(),
fileMetadata.getLastPrinted());
dateMetadata.put(IMPLICIT_TIMESTAMP_COLUMN.MODIFIED.getFieldName(),
fileMetadata.getModified());
+
+ // Populate List columns
+ listMetadata.put(IMPLICIT_LIST_COLUMN.SHEETS.getFieldName(),
getSheetNames());
}
/**
@@ -485,9 +503,14 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
for (IMPLICIT_TIMESTAMP_COLUMN name : IMPLICIT_TIMESTAMP_COLUMN.values()) {
makeColumn(builder, name.getFieldName(), MinorType.TIMESTAMP);
}
+
+ // Add List Column Names
+ for (IMPLICIT_LIST_COLUMN name : IMPLICIT_LIST_COLUMN.values()) {
+ makeColumn(builder, name.getFieldName(), MinorType.LIST);
+ }
}
- private void makeColumn(SchemaBuilder builder, String name,
TypeProtos.MinorType type) {
+ private void makeColumn(SchemaBuilder builder, String name, MinorType type) {
// Verify supported types
switch (type) {
// The Excel Reader only Supports Strings, Floats and Date/Times
@@ -498,6 +521,9 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
case TIME:
builder.addNullable(name, type);
break;
+ case LIST:
+ builder.addArray(name, MinorType.VARCHAR);
+ break;
default:
throw UserException
.validationError()
@@ -566,6 +592,34 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
metadataColumnWriters.get(index).setTimestamp(Instant.ofEpochMilli(timeValue.getTime()));
}
}
+
+ // Write the sheet names. Since this is the only list field, its index can be computed directly.
+ int listIndex = IMPLICIT_STRING_COLUMN.values().length +
IMPLICIT_TIMESTAMP_COLUMN.values().length;
+ String sheetColumnName = IMPLICIT_LIST_COLUMN.SHEETS.fieldName;
+ List<String> sheetNames = listMetadata.get(sheetColumnName);
+
+ if (sheetNameWriter == null) {
+ int sheetColumnIndex =
rowWriter.tupleSchema().index(IMPLICIT_LIST_COLUMN.SHEETS.getFieldName());
+ if (sheetColumnIndex == -1) {
+ ColumnMetadata colSchema = MetadataUtils.newScalar(sheetColumnName,
MinorType.VARCHAR, DataMode.REPEATED);
+ colSchema.setBooleanProperty(ColumnMetadata.EXCLUDE_FROM_WILDCARD,
true);
+ listIndex = rowWriter.addColumn(colSchema);
+ }
+ sheetNameWriter = rowWriter.column(listIndex).array().scalar();
+ }
+
+ for (String sheetName : sheetNames) {
+ sheetNameWriter.setString(sheetName);
+ }
+ }
+
+ private List<String> getSheetNames() {
+ List<String> sheets = new ArrayList<>();
+ int sheetCount = streamingWorkbook.getNumberOfSheets();
+ for (int i = 0; i < sheetCount; i++) {
+ sheets.add(streamingWorkbook.getSheetName(i));
+ }
+ return sheets;
}
@Override
diff --git
a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index b79108b..a1521e3 100644
---
a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++
b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -38,6 +38,7 @@ import org.junit.experimental.categories.Category;
import java.nio.file.Paths;
import static org.apache.drill.test.QueryTestUtil.generateCompressedFile;
+import static org.apache.drill.test.rowSet.RowSetUtilities.strArray;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -513,4 +514,20 @@ public class TestExcelFormat extends ClusterTest {
new RowSetComparison(expected).verifyAndClearAll(results);
}
+
+ @Test
+ public void testGetSheetNames() throws RpcException {
+ String sql = "SELECT _sheets FROM dfs.`excel/test_data.xlsx` LIMIT 1";
+
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addArray("_sheets", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+ .addRow((Object)strArray("data", "secondSheet", "thirdSheet",
"fourthSheet", "emptySheet", "missingDataSheet", "inconsistentData", "comps"))
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
}