[drill] branch master updated: DRILL-8106: do not use cellIterator because it skips empty cells (#2426)

cgivre Thu, 13 Jan 2022 12:10:14 -0800

This is an automated email from the ASF dual-hosted git repository.

cgivre pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/drill.git



The following commit(s) were added to refs/heads/master by this push:
     new bd0458e  DRILL-8106: do not use cellIterator because it skips empty 
cells (#2426)
bd0458e is described below

commit bd0458e027bc90c1332fa1d5376d9616b720213a
Author: PJ Fanning <[email protected]>
AuthorDate: Thu Jan 13 21:10:00 2022 +0100

    DRILL-8106: do not use cellIterator because it skips empty cells (#2426)
    
    * [DRILL-8106] do not use cellIterator because it skips empty cells
    
    * Update ExcelBatchReader.java
    
    * Update ExcelBatchReader.java
    
    * Update ExcelBatchReader.java
    
    * Update ExcelBatchReader.java
    
    * Create missing_data.xlsx
    
    * Update TestExcelFormat.java
    
    * Update TestExcelFormat.java
---
 .../drill/exec/store/excel/ExcelBatchReader.java   |  35 +++++++--------------
 .../drill/exec/store/excel/TestExcelFormat.java    |  30 ++++++++++++++++++
 .../src/test/resources/excel/missing_data.xlsx     | Bin 0 -> 10235 bytes
 3 files changed, 41 insertions(+), 24 deletions(-)

diff --git 
a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
 
b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index feabf3c..f167c6d 100644
--- 
a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ 
b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -58,7 +58,6 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
-import java.util.NoSuchElementException;
 
 public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
 
@@ -312,41 +311,31 @@ public class ExcelBatchReader implements 
ManagedReader<FileSchemaNegotiator> {
     //If there are no headers, create columns names of field_n
     if (readerConfig.headerRow == -1) {
       String missingFieldName;
-      int i = 0;
 
-      for (Cell c : currentRow) {
-        missingFieldName = MISSING_FIELD_NAME_HEADER + (i + 1);
+      for (short colNum = 0; colNum < currentRow.getLastCellNum(); colNum++) {
+        missingFieldName = MISSING_FIELD_NAME_HEADER + (colNum + 1);
         makeColumn(builder, missingFieldName, MinorType.VARCHAR);
-        excelFieldNames.add(i, missingFieldName);
-        i++;
+        excelFieldNames.add(colNum, missingFieldName);
       }
       builder.buildSchema();
     } else if (rowIterator.hasNext()) {
       //Get the header row and column count
       totalColumnCount = currentRow.getLastCellNum();
-      Cell dataCell = null;
 
       //Read the header row
-      Iterator<Cell> headerRowIterator = currentRow.cellIterator();
-      int colPosition = 0;
+      Row headerRow = currentRow;
       String tempColumnName;
 
       // Get the first data row.
       currentRow = rowIterator.next();
       Row firstDataRow = currentRow;
-      Iterator<Cell> dataRowIterator = firstDataRow.cellIterator();
 
-
-      while (headerRowIterator.hasNext()) {
+      for (short colPosition = 0; colPosition < totalColumnCount; 
colPosition++) {
         // We need this to get the header names
-        Cell cell = headerRowIterator.next();
+        Cell cell = headerRow.getCell(colPosition, 
Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
 
         // Since header names are most likely all Strings, we need the first 
row of actual data to get the data types
-        try {
-          dataCell = dataRowIterator.next();
-        } catch (NoSuchElementException e) {
-          // Do nothing... empty value in data cell
-        }
+        Cell dataCell = firstDataRow.getCell(colPosition, 
Row.MissingCellPolicy.CREATE_NULL_AS_BLANK);
 
         switch (dataCell.getCellType()) {
           case STRING:
@@ -360,10 +349,7 @@ public class ExcelBatchReader implements 
ManagedReader<FileSchemaNegotiator> {
             makeColumn(builder, tempColumnName, MinorType.VARCHAR);
             excelFieldNames.add(colPosition, tempColumnName);
             break;
-          case FORMULA:
-          case NUMERIC:
-          case _NONE:
-          case BLANK:
+          default:
             tempColumnName = cell.getStringCellValue();
 
             // Remove leading and trailing whitespace
@@ -372,7 +358,6 @@ public class ExcelBatchReader implements 
ManagedReader<FileSchemaNegotiator> {
             excelFieldNames.add(colPosition, tempColumnName);
             break;
         }
-        colPosition++;
       }
     }
     addMetadataToSchema(builder);
@@ -484,7 +469,9 @@ public class ExcelBatchReader implements 
ManagedReader<FileSchemaNegotiator> {
       Cell cell = currentRow.getCell(colPosition);
 
       populateColumnArray(cell, colPosition);
-      cellWriterArray.get(colWriterIndex).load(cell);
+      if (colWriterIndex < cellWriterArray.size()) {
+        cellWriterArray.get(colWriterIndex).load(cell);
+      }
 
       colPosition++;
     }
diff --git 
a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
 
b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index a8caba1..e9304e2 100644
--- 
a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ 
b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -704,4 +704,34 @@ public class TestExcelFormat extends ClusterTest {
     new RowSetComparison(expected).verifyAndClearAll(results);
   }
 
+  @Test
+  public void testMissingData() throws RpcException {
+    String sql = "SELECT * FROM dfs.`excel/missing_data.xlsx`";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("original_id", MinorType.VARCHAR)
+      .addNullable("original_nameFirst", MinorType.VARCHAR)
+      .addNullable("original_nameLast", MinorType.VARCHAR)
+      .addNullable("original_emailWork", MinorType.VARCHAR)
+      .addNullable("original_cityHome", MinorType.VARCHAR)
+      .addNullable("original_zipcodeHome", MinorType.FLOAT8)
+      .addNullable("original_countryHome", MinorType.VARCHAR)
+      .addNullable("original_birthday", MinorType.TIMESTAMP)
+      .addNullable("original_stateHome", MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow("XXXX00000001", "James", "Kushner", null, null, 10235, "US", 
LocalDate.parse("1957-04-18").atStartOfDay().toInstant(ZoneOffset.UTC), "NY")
+      .addRow("XXXX00000002", "Steve", "Hecht", null, null, 11213, "US", 
LocalDate.parse("1982-08-10").atStartOfDay().toInstant(ZoneOffset.UTC), "NY")
+      .addRow("XXXX00000003", "Ethan", "Stein", null, null, 10028, "US", 
LocalDate.parse("1991-04-11").atStartOfDay().toInstant(ZoneOffset.UTC), "NY")
+      .addRow("XXXX00000004", "Mohammed", "Fatima", null, "Baltimore", 21202, 
"US", LocalDate.parse("1990-05-15").atStartOfDay().toInstant(ZoneOffset.UTC), 
"MD")
+      .addRow("XXXX00000005", "Yakov", "Borodin", null, "Teaneck", 7666, "US", 
LocalDate.parse("1986-12-20").atStartOfDay().toInstant(ZoneOffset.UTC), "NJ")
+      .addRow("XXXX00000006", "Akhil", "Chavda", null, null, null, "US", null, 
null)
+      .addRow("XXXX00000007", "Mark", "Rahman", null, "Ellicott City", 21043, 
null, LocalDate.parse("1974-06-13").atStartOfDay().toInstant(ZoneOffset.UTC), 
"MD")
+      .addRow("XXXX00000008", "Henry", "Smith", "[email protected]", null, null, 
null, null, null)
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
 }
diff --git a/contrib/format-excel/src/test/resources/excel/missing_data.xlsx 
b/contrib/format-excel/src/test/resources/excel/missing_data.xlsx
new file mode 100644
index 0000000..8d38bb8
Binary files /dev/null and 
b/contrib/format-excel/src/test/resources/excel/missing_data.xlsx differ

[drill] branch master updated: DRILL-8106: do not use cellIterator because it skips empty cells (#2426)

Reply via email to