[
https://issues.apache.org/jira/browse/DRILL-4982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15731188#comment-15731188
]
ASF GitHub Bot commented on DRILL-4982:
---------------------------------------
Github user paul-rogers commented on a diff in the pull request:
https://github.com/apache/drill/pull/638#discussion_r91447651
--- Diff: contrib/storage-hive/core/src/main/codegen/templates/HiveRecordReaders.java ---
@@ -0,0 +1,300 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * This template is used to generate different Hive record reader classes for different data formats
+ * to avoid JIT profile pollution. These readers are derived from HiveAbstractReader, which implements
+ * the code for the init and setup stages, but the repeated - and performance-critical - next() method
+ * is implemented separately in the classes generated from this template. The internal
+ * SkipRecordsInspector class is separated out for the same reason.
+ *
+ * For the performance gain from this change, please refer to:
+ * https://issues.apache.org/jira/browse/DRILL-4982
+ *
+ */
+<@pp.dropOutputFile />
+<#list hiveFormat.map as entry>
+<@pp.changeOutputFile name="/org/apache/drill/exec/store/hive/Hive${entry.hiveReader}Reader.java" />
+<#include "/@includes/license.ftl" />
+
+package org.apache.drill.exec.store.hive;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Properties;
+import org.apache.drill.common.exceptions.DrillRuntimeException;
+import org.apache.drill.common.exceptions.ExecutionSetupException;
+import org.apache.drill.common.expression.SchemaPath;
+import org.apache.drill.exec.ops.FragmentContext;
+import org.apache.drill.exec.vector.AllocationHelper;
+import org.apache.drill.exec.vector.ValueVector;
+import org.apache.hadoop.hive.metastore.api.Partition;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.hive.conf.HiveConf;
+
+import org.apache.hadoop.hive.serde2.SerDeException;
+
+import org.apache.hadoop.mapred.RecordReader;
+<#if entry.hasHeaderFooter == true>
+import org.apache.hadoop.hive.serde2.SerDe;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Queue;
+import java.util.Set;
+import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.serde.serdeConstants;
+</#if>
+
+public class Hive${entry.hiveReader}Reader extends HiveAbstractReader {
+
+  Object key;
+<#if entry.hasHeaderFooter == true>
+  SkipRecordsInspector skipRecordsInspector;
+<#else>
+  Object value;
+</#if>
+
+  public Hive${entry.hiveReader}Reader(Table table, Partition partition, InputSplit inputSplit,
+      List<SchemaPath> projectedColumns, FragmentContext context, final HiveConf hiveConf,
+      UserGroupInformation proxyUgi) throws ExecutionSetupException {
+    super(table, partition, inputSplit, projectedColumns, context, hiveConf, proxyUgi);
+  }
+
+  public void internalInit(Properties tableProperties, RecordReader<Object, Object> reader) {
+
+    key = reader.createKey();
+<#if entry.hasHeaderFooter == true>
+    skipRecordsInspector = new SkipRecordsInspector(tableProperties, reader);
+<#else>
+    value = reader.createValue();
+</#if>
+
+  }
+  private void readHiveRecordAndInsertIntoRecordBatch(Object deSerializedValue, int outputRecordIndex) {
+    for (int i = 0; i < selectedStructFieldRefs.size(); i++) {
+      Object hiveValue = finalOI.getStructFieldData(deSerializedValue, selectedStructFieldRefs.get(i));
+      if (hiveValue != null) {
+        selectedColumnFieldConverters.get(i).setSafeValue(selectedColumnObjInspectors.get(i), hiveValue,
+            vectors.get(i), outputRecordIndex);
+      }
+    }
+  }
+
+<#if entry.hasHeaderFooter == true>
+  @Override
+  public int next() {
+    for (ValueVector vv : vectors) {
+      AllocationHelper.allocateNew(vv, TARGET_RECORD_COUNT);
+    }
+    if (empty) {
+      setValueCountAndPopulatePartitionVectors(0);
+      return 0;
+    }
+
+    try {
+      skipRecordsInspector.reset();
+      Object value;
+
+      int recordCount = 0;
+
+      while (recordCount < TARGET_RECORD_COUNT && reader.next(key, value = skipRecordsInspector.getNextValue())) {
+        if (skipRecordsInspector.doSkipHeader(recordCount++)) {
+          continue;
+        }
+        Object bufferedValue = skipRecordsInspector.bufferAdd(value);
+        if (bufferedValue != null) {
+          Object deSerializedValue = partitionSerDe.deserialize((Writable) bufferedValue);
+          if (partTblObjectInspectorConverter != null) {
+            deSerializedValue = partTblObjectInspectorConverter.convert(deSerializedValue);
+          }
+          readHiveRecordAndInsertIntoRecordBatch(deSerializedValue, skipRecordsInspector.getActualCount());
+          skipRecordsInspector.incrementActualCount();
+        }
+        skipRecordsInspector.incrementTempCount();
+      }
+
+      setValueCountAndPopulatePartitionVectors(skipRecordsInspector.getActualCount());
+      skipRecordsInspector.updateContinuance();
+      return skipRecordsInspector.getActualCount();
+    } catch (IOException | SerDeException e) {
+      throw new DrillRuntimeException(e);
+    }
+  }
+
+/**
+ * SkipRecordsInspector encapsulates the logic to skip a file's header and footer.
+ * The logic applies only to the file formats predefined in the constructor.
+ */
+protected class SkipRecordsInspector {
--- End diff ---
Small point, but nested classes are usually indented one level to help
highlight that they are, in fact, nested.
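To make the template mechanics above concrete: <@pp.changeOutputFile> makes the FreeMarker preprocessor emit one Java source file per entry in hiveFormat.map, with ${entry.hiveReader} substituted and the <#if entry.hasHeaderFooter> branches resolved at generation time. As an illustration only (the actual entry names live in the codegen data model, which is not part of this diff), a hypothetical entry with hiveReader = "Text" and hasHeaderFooter = true would expand to roughly:

  package org.apache.drill.exec.store.hive;

  // ... the imports from the template's hasHeaderFooter branch ...

  public class HiveTextReader extends HiveAbstractReader {

    Object key;
    SkipRecordsInspector skipRecordsInspector;  // the <#if entry.hasHeaderFooter> branch was taken

    public HiveTextReader(Table table, Partition partition, InputSplit inputSplit,
        List<SchemaPath> projectedColumns, FragmentContext context,
        final HiveConf hiveConf, UserGroupInformation proxyUgi)
        throws ExecutionSetupException {
      super(table, partition, inputSplit, projectedColumns, context, hiveConf, proxyUgi);
    }

    public void internalInit(Properties tableProperties, RecordReader<Object, Object> reader) {
      key = reader.createKey();
      skipRecordsInspector = new SkipRecordsInspector(tableProperties, reader);
    }

    // next() and the nested SkipRecordsInspector class are expanded here as
    // well, so this class carries its own compiled copy of the hot read loop.
  }

Each format thus gets a distinct class whose next() loop the JIT compiles and profiles independently.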
> Hive Queries degrade when queries switch between different formats
> ------------------------------------------------------------------
>
> Key: DRILL-4982
> URL: https://issues.apache.org/jira/browse/DRILL-4982
> Project: Apache Drill
> Issue Type: Bug
> Reporter: Chunhui Shi
> Assignee: Karthikeyan Manivannan
> Priority: Critical
> Fix For: 1.10.0
>
>
> We have seen degraded performance by doing these steps:
> 1) generate the repro data:
> python script repro.py as below:
> import string
> import random
>
> for i in range(30000000):
>     x1 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x2 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x3 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x4 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x5 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x6 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     print "{0}".format(x1),"{0}".format(x2),"{0}".format(x3),"{0}".format(x4),"{0}".format(x5),"{0}".format(x6)
> python repro.py > repro.csv
> 2) Put the generated file in a DFS directory, e.g. '/tmp/hiveworkspace/plain'. At the
> Hive prompt, use the following SQL command to create an external table:
> CREATE EXTERNAL TABLE `hiveworkspace`.`plain` (`id1` string, `id2` string,
> `id3` string, `id4` string, `id5` string, `id6` string) ROW FORMAT SERDE
> 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS TEXTFILE LOCATION
> '/tmp/hiveworkspace/plain'
> 3) Create Hive tables in ORC and Parquet formats:
> CREATE TABLE `hiveworkspace`.`plainorc` STORED AS ORC AS SELECT
> id1,id2,id3,id4,id5,id6 from `hiveworkspace`.`plain`;
> CREATE TABLE `hiveworkspace`.`plainparquet` STORED AS PARQUET AS SELECT
> id1,id2,id3,id4,id5,id6 from `hiveworkspace`.`plain`;
> 4) Alternate queries between these two tables; the query time on the same
> table then lengthens significantly. On my setup, for ORC, it went from 15 sec
> to 26 sec. Queries on tables of any format slow down significantly after a
> query against a table of a different format is run.
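The behavior in step 4 is consistent with the profile pollution the template header describes: a single concrete next() loop shared by all formats accumulates type profiles from every reader implementation it has ever seen. A minimal, self-contained sketch of the effect; all names here are hypothetical, not from Drill:

  // One file, e.g. ProfilePollutionSketch.java
  interface FormatReader {
    boolean next(StringBuilder out);
  }

  class TextReader implements FormatReader {
    @Override
    public boolean next(StringBuilder out) { out.append("text-row"); return true; }
  }

  class OrcReader implements FormatReader {
    @Override
    public boolean next(StringBuilder out) { out.append("orc-row"); return true; }
  }

  public class ProfilePollutionSketch {

    // One loop shared by every format: once both TextReader and OrcReader
    // have passed through, the reader.next(...) call site's type profile is
    // polymorphic for ALL subsequent callers, which can block inlining and
    // slow down every format from then on.
    static int sharedScan(FormatReader reader, int rows) {
      StringBuilder row = new StringBuilder();
      int count = 0;
      for (int i = 0; i < rows; i++) {
        if (reader.next(row)) {
          count++;
        }
        row.setLength(0);
      }
      return count;
    }

    public static void main(String[] args) {
      // Alternating formats through the same loop is the repro in miniature:
      // it is the switching between formats, not either format alone, that
      // degrades the shared call site.
      System.out.println(sharedScan(new TextReader(), 1_000_000));
      System.out.println(sharedScan(new OrcReader(), 1_000_000));
      System.out.println(sharedScan(new TextReader(), 1_000_000));
    }
  }

Generating one reader class per format from the template gives each format its own copy of the equivalent of sharedScan, so each copy's call-site profile stays monomorphic.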
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)