[GitHub] [incubator-doris] EmmyMiao87 commented on a diff in pull request #9433: [feature-wip](parquet-vec) Support parquet scanner in vectorized engine

GitBox Thu, 12 May 2022 00:22:03 -0700


EmmyMiao87 commented on code in PR #9433:
URL: https://github.com/apache/incubator-doris/pull/9433#discussion_r871026218



##########
fe/fe-core/src/main/java/org/apache/doris/load/Load.java:
##########
@@ -1044,30 +1047,61 @@ private static void initColumns(Table tbl, 
List<ImportColumnDesc> columnExprs,
         if (!needInitSlotAndAnalyzeExprs) {
             return;
         }
-
+        Set<String> exprArgsColumns = 
Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);

Review Comment:
   Named "exprSrcSlotName" 



##########
fe/fe-core/src/main/java/org/apache/doris/load/Load.java:
##########
@@ -1044,26 +1047,57 @@ private static void initColumns(Table tbl, 
List<ImportColumnDesc> columnExprs,
         if (!needInitSlotAndAnalyzeExprs) {
             return;
         }
-
+        Set<String> exprArgsColumns = 
Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
+        for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
+            if (importColumnDesc.isColumn()) {
+                continue;
+            }
+            List<SlotRef> slots = Lists.newArrayList();
+            importColumnDesc.getExpr().collect(SlotRef.class, slots);
+            for (SlotRef slot : slots) {
+                String slotColumnName = slot.getColumnName();
+                exprArgsColumns.add(slotColumnName);
+            }
+        }
+        Set<String> excludedColumns = 
Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
         // init slot desc add expr map, also transform hadoop functions
         for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
             // make column name case match with real column name
             String columnName = importColumnDesc.getColumnName();
-            String realColName = tbl.getColumn(columnName) == null ? columnName
-                    : tbl.getColumn(columnName).getName();
+            Column tblColumn = tbl.getColumn(columnName);
+            String realColName =  tblColumn == null ? columnName : 
tblColumn.getName();
             if (importColumnDesc.getExpr() != null) {
                 Expr expr = transformHadoopFunctionExpr(tbl, realColName, 
importColumnDesc.getExpr());
                 exprsByName.put(realColName, expr);
             } else {
                 SlotDescriptor slotDesc = 
analyzer.getDescTbl().addSlotDescriptor(srcTupleDesc);
-                slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
+                // only support parquet format now
+                if (useVectorizedLoad  && formatType == 
TFileFormatType.FORMAT_PARQUET
+                        && tblColumn != null) {
+                    // in vectorized load
+                    if (exprArgsColumns.contains(columnName)) {
+                        // columns in expr args should be varchar type
+                        
slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
+                        slotDesc.setColumn(new Column(realColName, 
PrimitiveType.VARCHAR));
+                        excludedColumns.add(realColName);
+                    } else {
+                        // columns from files like parquet files can be parsed 
as the type in table schema
+                        slotDesc.setType(tblColumn.getType());
+                        slotDesc.setColumn(new Column(realColName, 
tblColumn.getType()));
+                    }
+                    // non-nullable column is allowed in vectorized load with 
parquet format

Review Comment:
   If the src slot is null, it will be error



##########
fe/fe-core/src/main/java/org/apache/doris/load/Load.java:
##########
@@ -1044,30 +1047,61 @@ private static void initColumns(Table tbl, 
List<ImportColumnDesc> columnExprs,
         if (!needInitSlotAndAnalyzeExprs) {
             return;
         }
-
+        Set<String> exprArgsColumns = 
Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
+        for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
+            if (importColumnDesc.isColumn()) {
+                continue;
+            }
+            List<SlotRef> slots = Lists.newArrayList();
+            importColumnDesc.getExpr().collect(SlotRef.class, slots);
+            for (SlotRef slot : slots) {
+                String slotColumnName = slot.getColumnName();
+                exprArgsColumns.add(slotColumnName);
+            }
+        }
+        Set<String> excludedColumns = 
Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
         // init slot desc add expr map, also transform hadoop functions
         for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
             // make column name case match with real column name
             String columnName = importColumnDesc.getColumnName();
             String realColName;
-            if (tbl.getColumn(columnName) == null || 
importColumnDesc.getExpr() == null) {
+            if (tblColumn == null || importColumnDesc.getExpr() == null) {
                 realColName = columnName;
             } else {
-                realColName = tbl.getColumn(columnName).getName();
+                realColName = tblColumn.getName();
             }
             if (importColumnDesc.getExpr() != null) {
                 Expr expr = transformHadoopFunctionExpr(tbl, realColName, 
importColumnDesc.getExpr());
                 exprsByName.put(realColName, expr);
             } else {
                 SlotDescriptor slotDesc = 
analyzer.getDescTbl().addSlotDescriptor(srcTupleDesc);
-                slotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR));
+                // only support parquet format now
+                if (useVectorizedLoad  && formatType == 
TFileFormatType.FORMAT_PARQUET
+                        && tblColumn != null) {
+                    // in vectorized load
+                    if (exprArgsColumns.contains(columnName)) {

Review Comment:
   How about following situation ?
   table schema: a, b
   src schema : a
   src expr: a, b= a+1 



##########
fe/fe-core/src/main/java/org/apache/doris/load/Load.java:
##########
@@ -1044,30 +1047,61 @@ private static void initColumns(Table tbl, 
List<ImportColumnDesc> columnExprs,
         if (!needInitSlotAndAnalyzeExprs) {
             return;
         }
-
+        Set<String> exprArgsColumns = 
Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
+        for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
+            if (importColumnDesc.isColumn()) {
+                continue;
+            }
+            List<SlotRef> slots = Lists.newArrayList();
+            importColumnDesc.getExpr().collect(SlotRef.class, slots);
+            for (SlotRef slot : slots) {
+                String slotColumnName = slot.getColumnName();
+                exprArgsColumns.add(slotColumnName);
+            }
+        }
+        Set<String> excludedColumns = 
Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
         // init slot desc add expr map, also transform hadoop functions
         for (ImportColumnDesc importColumnDesc : copiedColumnExprs) {
             // make column name case match with real column name
             String columnName = importColumnDesc.getColumnName();
             String realColName;
-            if (tbl.getColumn(columnName) == null || 
importColumnDesc.getExpr() == null) {
+            if (tblColumn == null || importColumnDesc.getExpr() == null) {

Review Comment:
   Where did you declare this variable?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [incubator-doris] EmmyMiao87 commented on a diff in pull request #9433: [feature-wip](parquet-vec) Support parquet scanner in vectorized engine

Reply via email to