[GitHub] [drill] vvysotskyi commented on a change in pull request #2069: DRILL-7723 - Add Excel Metadata as Implicit Fields

GitBox Sat, 23 May 2020 12:36:30 -0700


vvysotskyi commented on a change in pull request #2069:
URL: https://github.com/apache/drill/pull/2069#discussion_r429570614




##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -75,7 +83,9 @@
 
   private Row currentRow;
 
-  private Workbook workbook;
+  private StreamingWorkbook swb;

Review comment:
       Is there any reason for using a specific implementation instead of the 
interface here?

##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -75,7 +83,9 @@
 
   private Row currentRow;
 
-  private Workbook workbook;
+  private StreamingWorkbook swb;
+
+  private CoreProperties fileMetadata;

Review comment:
       This field is obtained from the previous one. Are there any reasons for 
adding it?

##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -147,10 +185,15 @@ private void 
openFile(FileScanFramework.FileSchemaNegotiator negotiator) {
       fsStream = 
negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
 
       // Open streaming reader
-      workbook = StreamingReader.builder()
+      Workbook workbook = StreamingReader.builder()
         .rowCacheSize(ROW_CACHE_SIZE)
         .bufferSize(BUFFER_SIZE)
+        .setReadCoreProperties(true)
         .open(fsStream);
+
+      swb =(StreamingWorkbook)workbook;

Review comment:
       ```suggestion
         swb = (StreamingWorkbook) workbook;
   ```

##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -334,9 +382,18 @@ private boolean nextLine(RowSetLoader rowWriter) {
       colPosition++;
     }
 
+    if (firstLine) {

Review comment:
       This `if` may be combined with the next one.

##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -199,7 +245,7 @@ private void getColumnHeaders(SchemaBuilder builder) {
         i++;
       }
       columnWriters = new ArrayList<>(columnCount);
-
+      metadataColumnWriters = new ArrayList<>();

Review comment:
       This field is initialized here and then initialized again with the same 
value after the if-else block. The same applies to `columnWriters`.

##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -348,6 +405,27 @@ private boolean nextLine(RowSetLoader rowWriter) {
     }
   }
 
+  private void populateMetadata() {
+    stringMetadata = new HashMap<>();
+    dateMetadata = new HashMap<>();
+
+    stringMetadata.put(IMPLICIT_STRING_COLUMN_NAMES.get(0), 
fileMetadata.getCategory());

Review comment:
       Please do not rely on the index of a specific column. You may define 
an enum with the implicit Excel field names, use its elements here, and iterate 
through all of its elements where required instead of using the list.

##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -420,15 +510,57 @@ private void addColumnToArray(TupleWriter rowWriter, 
String name, TypeProtos.Min
     }
   }
 
+  private void addMetadataWriters() {
+    for (String colName : IMPLICIT_STRING_COLUMN_NAMES) {
+      addMetadataColumnsToArray(rowWriter, colName, MinorType.VARCHAR);
+    }
+    for (String colName : IMPLICIT_TIMESTAMP_COLUMN_NAMES) {
+      addMetadataColumnsToArray(rowWriter, colName, MinorType.TIMESTAMP);
+    }
+  }
+
+  private void addMetadataColumnsToArray(TupleWriter rowWriter, String name, 
MinorType type) {
+    int index = rowWriter.tupleSchema().index(name);
+    if (index == -1) {
+      ColumnMetadata colSchema = MetadataUtils.newScalar(name, type, 
TypeProtos.DataMode.OPTIONAL);
+      colSchema.setBooleanProperty(ColumnMetadata.EXCLUDE_FROM_WILDCARD, true);
+      index = rowWriter.addColumn(colSchema);
+    } else {
+      return;
+    }
+    metadataColumnWriters.add(rowWriter.scalar(index));
+  }
+
+  private void writeMetadata() {
+    for (int i = 0; i < metadataColumnWriters.size(); i++) {

Review comment:
       `metadataColumnWriters` is populated from the 
`IMPLICIT_STRING_COLUMN_NAMES` and `IMPLICIT_TIMESTAMP_COLUMN_NAMES` lists. 
Instead of iterating through the `metadataColumnWriters` values and adding a 
check for the index, would it be possible to iterate through the values of 
those lists only?

##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -75,7 +83,9 @@
 
   private Row currentRow;
 
-  private Workbook workbook;
+  private StreamingWorkbook swb;

Review comment:
       The initial name was clearer.

##########
File path: 
contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
##########
@@ -112,6 +112,42 @@ public void testExplicitAllQuery() throws RpcException {
     new RowSetComparison(expected).verifyAndClearAll(results);
   }
 
+
+  @Test
+  public void testExplicitMetadataQuery() throws RpcException {
+    String sql =
+      "SELECT _category, _content_status, _content_type, _creator, 
_description, _identifier, _keywords, _last_modified_by_user, _revision, 
_subject, _title, _created," +
+        "_last_printed, _modified FROM cp.`excel/test_data.xlsx` LIMIT 1";
+
+    QueryBuilder q = client.queryBuilder().sql(sql);
+    RowSet results = q.rowSet();
+    results.print();

Review comment:
       Please remove the printing of the result.

##########
File path: 
contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
##########
@@ -420,15 +510,57 @@ private void addColumnToArray(TupleWriter rowWriter, 
String name, TypeProtos.Min
     }
   }
 
+  private void addMetadataWriters() {
+    for (String colName : IMPLICIT_STRING_COLUMN_NAMES) {
+      addMetadataColumnsToArray(rowWriter, colName, MinorType.VARCHAR);
+    }
+    for (String colName : IMPLICIT_TIMESTAMP_COLUMN_NAMES) {
+      addMetadataColumnsToArray(rowWriter, colName, MinorType.TIMESTAMP);
+    }
+  }
+
+  private void addMetadataColumnsToArray(TupleWriter rowWriter, String name, 
MinorType type) {
+    int index = rowWriter.tupleSchema().index(name);
+    if (index == -1) {
+      ColumnMetadata colSchema = MetadataUtils.newScalar(name, type, 
TypeProtos.DataMode.OPTIONAL);
+      colSchema.setBooleanProperty(ColumnMetadata.EXCLUDE_FROM_WILDCARD, true);
+      index = rowWriter.addColumn(colSchema);
+    } else {
+      return;
+    }

Review comment:
       This part of the code is similar to the code from `addColumnToArray` 
method. Please refactor it to avoid code duplication.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [drill] vvysotskyi commented on a change in pull request #2069: DRILL-7723 - Add Excel Metadata as Implicit Fields

Reply via email to