[
https://issues.apache.org/jira/browse/FLINK-2170?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16182635#comment-16182635
]
ASF GitHub Bot commented on FLINK-2170:
---------------------------------------
Github user uybhatti commented on a diff in the pull request:
https://github.com/apache/flink/pull/4670#discussion_r141356723
--- Diff: flink-connectors/flink-orc/src/main/java/org/apache/flink/orc/OrcUtils.java ---
@@ -0,0 +1,2229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.orc;
+
+import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
+import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
+import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.typeutils.MapTypeInfo;
+import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo;
+import org.apache.flink.api.java.typeutils.RowTypeInfo;
+import org.apache.flink.types.Row;
+
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+
+import org.apache.orc.TypeDescription;
+
+import java.lang.reflect.Array;
+import java.math.BigDecimal;
+import java.sql.Date;
+import java.sql.Timestamp;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+/**
+ * A class that provides utility methods for orc file reading.
+ */
+public class OrcUtils {
+
+ /**
+ * Convert ORC schema types to Flink types.
+ *
+ * @param schema schema of orc file
+ *
+ */
+ public static TypeInformation schemaToTypeInfo(TypeDescription schema) {
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ return BasicTypeInfo.BOOLEAN_TYPE_INFO;
+ case BYTE:
+ return BasicTypeInfo.BYTE_TYPE_INFO;
+ case SHORT:
+ return BasicTypeInfo.SHORT_TYPE_INFO;
+ case INT:
+ return BasicTypeInfo.INT_TYPE_INFO;
+ case LONG:
+ return BasicTypeInfo.LONG_TYPE_INFO;
+ case FLOAT:
+ return BasicTypeInfo.FLOAT_TYPE_INFO;
+ case DOUBLE:
+ return BasicTypeInfo.DOUBLE_TYPE_INFO;
+ case STRING:
+ case CHAR:
+ case VARCHAR:
+ return BasicTypeInfo.STRING_TYPE_INFO;
+ case DATE:
+ return SqlTimeTypeInfo.DATE;
+ case TIMESTAMP:
+ return SqlTimeTypeInfo.TIMESTAMP;
+ case BINARY:
+ return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO;
+ case STRUCT:
+ List<TypeDescription> fieldSchemas = schema.getChildren();
+ TypeInformation[] fieldTypes = new TypeInformation[fieldSchemas.size()];
+ for (int i = 0; i < fieldSchemas.size(); i++) {
+ fieldTypes[i] = schemaToTypeInfo(fieldSchemas.get(i));
+ }
+ String[] fieldNames = schema.getFieldNames().toArray(new String[]{});
+ return new RowTypeInfo(fieldTypes, fieldNames);
+ case LIST:
+ TypeDescription elementSchema = schema.getChildren().get(0);
+ TypeInformation elementType = schemaToTypeInfo(elementSchema);
+ return ObjectArrayTypeInfo.getInfoFor(elementType);
+ case MAP:
+ TypeDescription keySchema = schema.getChildren().get(0);
+ TypeDescription valSchema = schema.getChildren().get(1);
+ TypeInformation keyType = schemaToTypeInfo(keySchema);
+ TypeInformation valType = schemaToTypeInfo(valSchema);
+ return new MapTypeInfo(keyType, valType);
+ case DECIMAL:
+ return BasicTypeInfo.BIG_DEC_TYPE_INFO;
+ case UNION:
+ throw new UnsupportedOperationException("UNION type not supported yet.");
+ default:
+ throw new IllegalArgumentException("Unknown type " + schema);
+ }
+ }
+
+ /**
+ * Fill rows from orc batch.
+ *
+ * @param rows the batch of rows need to be filled
+ * @param schema schema of orc file
+ * @param batch current orc batch data used to fill the rows
+ * @param fieldMapping field mapping
+ *
+ */
+ public static void fillRows(Object[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] fieldMapping) {
+
+ int totalRowsInBatch = (int) batch.count();
+
+ List<TypeDescription> fieldTypes = schema.getChildren();
+ for (int outIdx = 0; outIdx < fieldMapping.length; outIdx++) {
+ int inIdx = fieldMapping[outIdx];
+ readField(rows, outIdx, fieldTypes.get(inIdx), batch.cols[inIdx], null, Math.min((int) totalRowsInBatch, rows.length));
+ }
+ }
+
+ private static void readField(Object[] rows, int fieldIdx, TypeDescription schema, ColumnVector vector, long[] lengthVector, int childCount) {
+
+ switch (schema.getCategory()) {
+ case BOOLEAN:
+ if (vector.noNulls) {
+ readNonNullBooleanColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ } else {
+ readBooleanColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case BYTE:
+ if (vector.noNulls) {
+ readNonNullByteColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ } else {
+ readByteColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case SHORT:
+ if (vector.noNulls) {
+ readNonNullShortColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ } else {
+ readShortColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case INT:
+ if (vector.noNulls) {
+ readNonNullIntColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ } else {
+ readIntColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case LONG:
+ if (vector.noNulls) {
+ readNonNullLongColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ } else {
+ readLongColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case FLOAT:
+ if (vector.noNulls) {
+ readNonNullFloatColumn(rows, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount);
+ } else {
+ readFloatColumn(rows, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case DOUBLE:
+ if (vector.noNulls) {
+ readNonNullDoubleColumn(rows, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount);
+ } else {
+ readDoubleColumn(rows, fieldIdx, (DoubleColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case CHAR:
+ case VARCHAR:
+ case STRING:
+ if (vector.noNulls) {
+ readNonNullStringColumn(rows, fieldIdx, (BytesColumnVector) vector, lengthVector, childCount);
+ } else {
+ readStringColumn(rows, fieldIdx, (BytesColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case DATE:
+ if (vector.noNulls) {
+ readNonNullDateColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ } else {
+ readDateColumn(rows, fieldIdx, (LongColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case TIMESTAMP:
+ if (vector.noNulls) {
+ readNonNullTimestampColumn(rows, fieldIdx, (TimestampColumnVector) vector, lengthVector, childCount);
+ } else {
+ readTimestampColumn(rows, fieldIdx, (TimestampColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case BINARY:
+ if (vector.noNulls) {
+ readNonNullBinaryColumn(rows, fieldIdx, (BytesColumnVector) vector, lengthVector, childCount);
+ } else {
+ readBinaryColumn(rows, fieldIdx, (BytesColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case DECIMAL:
+ if (vector.noNulls) {
+ readNonNullDecimalColumn(rows, fieldIdx, (DecimalColumnVector) vector, lengthVector, childCount);
+ }
+ else {
+ readDecimalColumn(rows, fieldIdx, (DecimalColumnVector) vector, lengthVector, childCount);
+ }
+ break;
+ case STRUCT:
+ if (vector.noNulls) {
+ readNonNullStructColumn(rows, fieldIdx, (StructColumnVector) vector, schema, lengthVector, childCount);
+ } else {
+ readStructColumn(rows, fieldIdx, (StructColumnVector) vector, schema, lengthVector, childCount);
+ }
+ break;
+ case LIST:
+ if (vector.noNulls) {
+ readNonNullListColumn(rows, fieldIdx, (ListColumnVector) vector, schema, lengthVector, childCount);
+ }
+ else {
+ readListColumn(rows, fieldIdx, (ListColumnVector) vector, schema, lengthVector, childCount);
+ }
+ break;
+ case MAP:
+ if (vector.noNulls) {
+ readNonNullMapColumn(rows, fieldIdx, (MapColumnVector) vector, schema, lengthVector, childCount);
+ }
+ else {
+ readMapColumn(rows, fieldIdx, (MapColumnVector) vector, schema, lengthVector, childCount);
+ }
+ break;
+ case UNION:
+ throw new UnsupportedOperationException("UNION type not supported yet");
+ default:
+ throw new IllegalArgumentException("Unknown type " + schema);
+ }
+ }
+
+ private static void readNonNullBooleanColumn(Object[] rows, int fieldIdx, LongColumnVector vector, long[] lengthVector, int childCount) {
--- End diff --
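For reference, a minimal sketch of how the `schemaToTypeInfo` utility from the diff above might be invoked; the schema string and the wrapper class name are invented for illustration and are not part of the pull request:

```java
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.orc.OrcUtils;
import org.apache.orc.TypeDescription;

// Illustrative only: the schema string below is made up for this example.
public class SchemaConversionSketch {
	public static void main(String[] args) {
		TypeDescription schema =
			TypeDescription.fromString("struct<name:string,age:int,score:double>");
		// The ORC struct is expected to map to a RowTypeInfo with one field per struct member.
		TypeInformation typeInfo = OrcUtils.schemaToTypeInfo(schema);
		System.out.println(typeInfo);
	}
}
```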
Hi @haohui, yes, there is a lot of code duplication in the helper methods, but I think it is necessary because I prefer performance over fewer lines of code. For example, I could write a single method for boolean, byte, short, int, long, double and float, but then for every element in a column I would first have to check the type and then cast accordingly, which is not efficient.
Also, instead of evaluating the `if` condition inside the `for` loop, I run a separate `for` loop depending on the `if` condition, which is another reason for the code duplication.
Thanks for your feedback; please let me know what you think of these points.
Thanks, Usman
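To make the trade-off described in the comment above concrete, here is a simplified sketch of the two loop shapes for a long column; the class and method names are illustrative and do not match the actual helpers in the pull request:

```java
import org.apache.flink.types.Row;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;

// Simplified sketch: one branchy generic loop vs. one specialized loop per case.
class LoopSpecializationSketch {

	// Null check evaluated inside the loop: one branch per element.
	static void readLongColumnWithCheck(Object[] rows, int fieldIdx, LongColumnVector vector, int childCount) {
		for (int i = 0; i < childCount; i++) {
			if (vector.noNulls || !vector.isNull[i]) {
				((Row) rows[i]).setField(fieldIdx, vector.vector[i]);
			} else {
				((Row) rows[i]).setField(fieldIdx, null);
			}
		}
	}

	// Check hoisted out of the loop: the non-null path is a tight loop without per-element
	// branching, at the cost of duplicating the loop body per type and per null/non-null variant.
	static void readNonNullLongColumnSketch(Object[] rows, int fieldIdx, LongColumnVector vector, int childCount) {
		for (int i = 0; i < childCount; i++) {
			((Row) rows[i]).setField(fieldIdx, vector.vector[i]);
		}
	}
}
```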
> Add OrcTableSource
> ------------------
>
> Key: FLINK-2170
> URL: https://issues.apache.org/jira/browse/FLINK-2170
> Project: Flink
> Issue Type: New Feature
> Components: Table API & SQL
> Affects Versions: 0.9
> Reporter: Fabian Hueske
> Assignee: Usman Younas
> Priority: Minor
> Labels: starter
>
> Add a {{OrcTableSource}} to read data from an ORC file. The
> {{OrcTableSource}} should implement the {{ProjectableTableSource}}
> (FLINK-3848) and {{FilterableTableSource}} (FLINK-3849) interfaces.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)