This is an automated email from the ASF dual-hosted git repository.
cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git
The following commit(s) were added to refs/heads/master by this push:
new bd0458e DRILL-8106: do not use cellIterator because it skips empty
cells (#2426)
bd0458e is described below
commit bd0458e027bc90c1332fa1d5376d9616b720213a
Author: PJ Fanning <[email protected]>
AuthorDate: Thu Jan 13 21:10:00 2022 +0100
DRILL-8106: do not use cellIterator because it skips empty cells (#2426)
* [DRILL-8106] do not use cellIterator because it skips empty cells
* Update ExcelBatchReader.java
* Update ExcelBatchReader.java
* Update ExcelBatchReader.java
* Update ExcelBatchReader.java
* Create missing_data.xlsx
* Update TestExcelFormat.java
* Update TestExcelFormat.java
---
.../drill/exec/store/excel/ExcelBatchReader.java | 35 +++++++--------------
.../drill/exec/store/excel/TestExcelFormat.java | 30 ++++++++++++++++++
.../src/test/resources/excel/missing_data.xlsx | Bin 0 -> 10235 bytes
3 files changed, 41 insertions(+), 24 deletions(-)
diff --git
a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index feabf3c..f167c6d 100644
---
a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++
b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -58,7 +58,6 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import java.util.NoSuchElementException;
public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
@@ -312,41 +311,31 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
//If there are no headers, create columns names of field_n
if (readerConfig.headerRow == -1) {
String missingFieldName;
- int i = 0;
- for (Cell c : currentRow) {
- missingFieldName = MISSING_FIELD_NAME_HEADER + (i + 1);
+ for (short colNum = 0; colNum < currentRow.getLastCellNum(); colNum++) {
+ missingFieldName = MISSING_FIELD_NAME_HEADER + (colNum + 1);
makeColumn(builder, missingFieldName, MinorType.VARCHAR);
- excelFieldNames.add(i, missingFieldName);
- i++;
+ excelFieldNames.add(colNum, missingFieldName);
}
builder.buildSchema();
} else if (rowIterator.hasNext()) {
//Get the header row and column count
totalColumnCount = currentRow.getLastCellNum();
- Cell dataCell = null;
//Read the header row
- Iterator<Cell> headerRowIterator = currentRow.cellIterator();
- int colPosition = 0;
+ Row headerRow = currentRow;
String tempColumnName;
// Get the first data row.
currentRow = rowIterator.next();
Row firstDataRow = currentRow;
- Iterator<Cell> dataRowIterator = firstDataRow.cellIterator();
-
- while (headerRowIterator.hasNext()) {
+ for (short colPosition = 0; colPosition < totalColumnCount;
colPosition++) {
// We need this to get the header names
- Cell cell = headerRowIterator.next();
+ Cell cell = headerRow.getCell(colPosition,
Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
// Since header names are most likely all Strings, we need the first
row of actual data to get the data types
- try {
- dataCell = dataRowIterator.next();
- } catch (NoSuchElementException e) {
- // Do nothing... empty value in data cell
- }
+ Cell dataCell = firstDataRow.getCell(colPosition,
Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
switch (dataCell.getCellType()) {
case STRING:
@@ -360,10 +349,7 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
makeColumn(builder, tempColumnName, MinorType.VARCHAR);
excelFieldNames.add(colPosition, tempColumnName);
break;
- case FORMULA:
- case NUMERIC:
- case _NONE:
- case BLANK:
+ default:
tempColumnName = cell.getStringCellValue();
// Remove leading and trailing whitespace
@@ -372,7 +358,6 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
excelFieldNames.add(colPosition, tempColumnName);
break;
}
- colPosition++;
}
}
addMetadataToSchema(builder);
@@ -484,7 +469,9 @@ public class ExcelBatchReader implements
ManagedReader<FileSchemaNegotiator> {
Cell cell = currentRow.getCell(colPosition);
populateColumnArray(cell, colPosition);
- cellWriterArray.get(colWriterIndex).load(cell);
+ if (colWriterIndex < cellWriterArray.size()) {
+ cellWriterArray.get(colWriterIndex).load(cell);
+ }
colPosition++;
}
diff --git
a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index a8caba1..e9304e2 100644
---
a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++
b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -704,4 +704,34 @@ public class TestExcelFormat extends ClusterTest {
new RowSetComparison(expected).verifyAndClearAll(results);
}
+ @Test
+ public void testMissingData() throws RpcException {
+ String sql = "SELECT * FROM dfs.`excel/missing_data.xlsx`";
+
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("original_id", MinorType.VARCHAR)
+ .addNullable("original_nameFirst", MinorType.VARCHAR)
+ .addNullable("original_nameLast", MinorType.VARCHAR)
+ .addNullable("original_emailWork", MinorType.VARCHAR)
+ .addNullable("original_cityHome", MinorType.VARCHAR)
+ .addNullable("original_zipcodeHome", MinorType.FLOAT8)
+ .addNullable("original_countryHome", MinorType.VARCHAR)
+ .addNullable("original_birthday", MinorType.TIMESTAMP)
+ .addNullable("original_stateHome", MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+ .addRow("XXXX00000001", "James", "Kushner", null, null, 10235, "US",
LocalDate.parse("1957-04-18").atStartOfDay().toInstant(ZoneOffset.UTC), "NY")
+ .addRow("XXXX00000002", "Steve", "Hecht", null, null, 11213, "US",
LocalDate.parse("1982-08-10").atStartOfDay().toInstant(ZoneOffset.UTC), "NY")
+ .addRow("XXXX00000003", "Ethan", "Stein", null, null, 10028, "US",
LocalDate.parse("1991-04-11").atStartOfDay().toInstant(ZoneOffset.UTC), "NY")
+ .addRow("XXXX00000004", "Mohammed", "Fatima", null, "Baltimore", 21202,
"US", LocalDate.parse("1990-05-15").atStartOfDay().toInstant(ZoneOffset.UTC),
"MD")
+ .addRow("XXXX00000005", "Yakov", "Borodin", null, "Teaneck", 7666, "US",
LocalDate.parse("1986-12-20").atStartOfDay().toInstant(ZoneOffset.UTC), "NJ")
+ .addRow("XXXX00000006", "Akhil", "Chavda", null, null, null, "US", null,
null)
+ .addRow("XXXX00000007", "Mark", "Rahman", null, "Ellicott City", 21043,
null, LocalDate.parse("1974-06-13").atStartOfDay().toInstant(ZoneOffset.UTC),
"MD")
+ .addRow("XXXX00000008", "Henry", "Smith", "[email protected]", null, null,
null, null, null)
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
}
diff --git a/contrib/format-excel/src/test/resources/excel/missing_data.xlsx
b/contrib/format-excel/src/test/resources/excel/missing_data.xlsx
new file mode 100644
index 0000000..8d38bb8
Binary files /dev/null and
b/contrib/format-excel/src/test/resources/excel/missing_data.xlsx differ