[ https://issues.apache.org/jira/browse/FLINK-2170?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16178632#comment-16178632 ]
ASF GitHub Bot commented on FLINK-2170:
---------------------------------------
Github user haohui commented on a diff in the pull request:
https://github.com/apache/flink/pull/4670#discussion_r140700434
--- Diff: flink-connectors/flink-orc/src/main/java/org/apache/flink/orc/OrcUtils.java ---
@@ -0,0 +1,2229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.orc;
+
+import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
+import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
+import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.typeutils.MapTypeInfo;
+import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo;
+import org.apache.flink.api.java.typeutils.RowTypeInfo;
+import org.apache.flink.types.Row;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+
+import org.apache.orc.TypeDescription;
+
+import java.lang.reflect.Array;
+import java.math.BigDecimal;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+/**
+ * A class that provides utility methods for reading ORC files.
+ */
+public class OrcUtils {
+
+	/**
+	 * Converts an ORC schema to the corresponding Flink type information.
+	 *
+	 * @param schema The schema of the ORC file.
+	 * @return The Flink TypeInformation that corresponds to the ORC schema.
+	 */
+	public static TypeInformation schemaToTypeInfo(TypeDescription schema) {
+		switch (schema.getCategory()) {
+			case BOOLEAN:
+				return BasicTypeInfo.BOOLEAN_TYPE_INFO;
+			case BYTE:
+				return BasicTypeInfo.BYTE_TYPE_INFO;
+			case SHORT:
+				return BasicTypeInfo.SHORT_TYPE_INFO;
+			case INT:
+				return BasicTypeInfo.INT_TYPE_INFO;
+			case LONG:
+				return BasicTypeInfo.LONG_TYPE_INFO;
+			case FLOAT:
+				return BasicTypeInfo.FLOAT_TYPE_INFO;
+			case DOUBLE:
+				return BasicTypeInfo.DOUBLE_TYPE_INFO;
+			case STRING:
+			case CHAR:
+			case VARCHAR:
+				return BasicTypeInfo.STRING_TYPE_INFO;
+			case DATE:
+				return SqlTimeTypeInfo.DATE;
+			case TIMESTAMP:
+				return SqlTimeTypeInfo.TIMESTAMP;
+			case BINARY:
+				return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO;
+			case STRUCT:
+				List<TypeDescription> fieldSchemas = schema.getChildren();
+				TypeInformation[] fieldTypes = new TypeInformation[fieldSchemas.size()];
+				for (int i = 0; i < fieldSchemas.size(); i++) {
+					fieldTypes[i] = schemaToTypeInfo(fieldSchemas.get(i));
+				}
+				String[] fieldNames = schema.getFieldNames().toArray(new String[]{});
+				return new RowTypeInfo(fieldTypes, fieldNames);
+			case LIST:
+				TypeDescription elementSchema = schema.getChildren().get(0);
+				TypeInformation elementType = schemaToTypeInfo(elementSchema);
+				return ObjectArrayTypeInfo.getInfoFor(elementType);
+			case MAP:
+				TypeDescription keySchema = schema.getChildren().get(0);
+				TypeDescription valSchema = schema.getChildren().get(1);
+				TypeInformation keyType = schemaToTypeInfo(keySchema);
+				TypeInformation valType = schemaToTypeInfo(valSchema);
+				return new MapTypeInfo(keyType, valType);
+			case DECIMAL:
+				return BasicTypeInfo.BIG_DEC_TYPE_INFO;
+			case UNION:
+				throw new UnsupportedOperationException("UNION type not supported yet.");
+			default:
+				throw new IllegalArgumentException("Unknown type " + schema);
+		}
+	}
+
+	/**
+	 * Fills Flink rows from an ORC batch.
+	 *
+	 * @param rows The batch of rows to be filled.
+	 * @param schema The schema of the ORC file.
+	 * @param batch The current ORC batch whose data is used to fill the rows.
+	 * @param fieldMapping The mapping from output row fields to ORC columns.
+	 */
+	public static void fillRows(Object[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] fieldMapping) {
+
+		int totalRowsInBatch = (int) batch.count();
+
+		List<TypeDescription> fieldTypes = schema.getChildren();
+		for (int outIdx = 0; outIdx < fieldMapping.length; outIdx++) {
+			int inIdx = fieldMapping[outIdx];
+			readField(rows, outIdx, fieldTypes.get(inIdx), batch.cols[inIdx], null, Math.min(totalRowsInBatch, rows.length));
+		}
+	}
+
+	private static void readField(Object[] rows, int fieldIdx, TypeDescription schema, ColumnVector vector, long[] lengthVector, int childCount) {
--- End diff ---
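For context, a minimal sketch of how these two utilities would be driven end to end. This assumes {{fillRows}} expects pre-allocated {{Row}} objects (as the per-field write pattern suggests); the path {{/tmp/data.orc}} and the identity {{fieldMapping}} are illustrative, not part of the PR:

{code:java}
import java.io.IOException;

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.types.Row;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class OrcUtilsUsageSketch {

	public static void main(String[] args) throws IOException {
		// Open the ORC file and derive the Flink type from its schema.
		Reader reader = OrcFile.createReader(
			new Path("/tmp/data.orc"), OrcFile.readerOptions(new Configuration()));
		TypeDescription schema = reader.getSchema();
		TypeInformation typeInfo = OrcUtils.schemaToTypeInfo(schema);
		System.out.println("Flink type: " + typeInfo);

		// Read all top-level columns in order (identity field mapping).
		int numFields = schema.getChildren().size();
		int[] fieldMapping = new int[numFields];
		for (int i = 0; i < numFields; i++) {
			fieldMapping[i] = i;
		}

		// Pre-allocate the Flink rows that fillRows() writes into.
		VectorizedRowBatch batch = schema.createRowBatch();
		Row[] rows = new Row[batch.getMaxSize()];
		for (int i = 0; i < rows.length; i++) {
			rows[i] = new Row(numFields);
		}

		// Copy the file into Flink rows, one vectorized batch at a time.
		RecordReader recordReader = reader.rows();
		while (recordReader.nextBatch(batch)) {
			OrcUtils.fillRows(rows, schema, batch, fieldMapping);
			// ... emit the first batch.size entries of rows downstream ...
		}
		recordReader.close();
	}
}
{code}

Note that {{fillRows}} caps the copy at {{Math.min(totalRowsInBatch, rows.length)}}, so the row array and the batch size do not have to match.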
There is significant duplication across these helper functions. It requires some refactoring to clean things up (one possible direction is sketched below).
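For illustration only, a rough sketch of such a refactoring; the names {{ValueReader}}, {{ColumnCopier}} and {{copyColumn}} are hypothetical and not part of this PR. The shared isRepeating/null-handling loop moves into one helper, and each ORC type contributes only a small value extractor:

{code:java}
import org.apache.flink.types.Row;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

public final class ColumnCopier {

	/** Extracts one value from a column vector at a given index. */
	interface ValueReader {
		Object read(ColumnVector vector, int idx);
	}

	/** The loop that every per-type readField variant currently repeats. */
	static void copyColumn(Object[] rows, int fieldIdx, ColumnVector vector, ValueReader reader, int childCount) {
		for (int i = 0; i < childCount; i++) {
			// isRepeating means the whole column carries the value at index 0.
			int idx = vector.isRepeating ? 0 : i;
			Object value = (!vector.noNulls && vector.isNull[idx]) ? null : reader.read(vector, idx);
			((Row) rows[i]).setField(fieldIdx, value);
		}
	}

	// With the loop factored out, the per-type variants shrink to one-liners.
	static final ValueReader LONG_READER = (v, i) -> ((LongColumnVector) v).vector[i];
	static final ValueReader DOUBLE_READER = (v, i) -> ((DoubleColumnVector) v).vector[i];
}
{code}

The lambda-based extractors keep each type's variant to one line while preserving the per-row null semantics of the current implementations.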
> Add OrcTableSource
> ------------------
>
> Key: FLINK-2170
> URL: https://issues.apache.org/jira/browse/FLINK-2170
> Project: Flink
> Issue Type: New Feature
> Components: Table API & SQL
> Affects Versions: 0.9
> Reporter: Fabian Hueske
> Assignee: Usman Younas
> Priority: Minor
> Labels: starter
>
> Add an {{OrcTableSource}} to read data from an ORC file. The
> {{OrcTableSource}} should implement the {{ProjectableTableSource}}
> (FLINK-3848) and {{FilterableTableSource}} (FLINK-3849) interfaces.
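The push-down interfaces the description asks for map naturally onto ORC's reader options. A minimal sketch of the ORC side of projection push-down (the file path is illustrative; the include array is indexed by ORC column id, with id 0 being the root struct):

{code:java}
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class OrcProjectionSketch {

	public static void main(String[] args) throws IOException {
		Reader reader = OrcFile.createReader(
			new Path("/tmp/data.orc"), OrcFile.readerOptions(new Configuration()));

		// Select only the columns the query needs; ORC skips the rest on disk.
		// Ids are assigned in pre-order: 0 is the root struct, 1 its first column, etc.
		boolean[] include = new boolean[reader.getSchema().getMaximumId() + 1];
		include[0] = true; // root struct
		include[1] = true; // first top-level column
		RecordReader rows = reader.rows(reader.options().include(include));
		rows.close();
	}
}
{code}

Filter push-down ({{FilterableTableSource}}, FLINK-3849) would similarly translate predicates into an ORC {{SearchArgument}} via {{Reader.Options#searchArgument}}.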
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)