[ https://issues.apache.org/jira/browse/DRILL-4982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15731190#comment-15731190 ]
ASF GitHub Bot commented on DRILL-4982: --------------------------------------- Github user paul-rogers commented on a diff in the pull request: https://github.com/apache/drill/pull/638#discussion_r91446474 --- Diff: contrib/storage-hive/core/src/main/codegen/templates/HiveRecordReaders.java --- @@ -0,0 +1,300 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This template is used to generate different Hive record reader classes for different data formats + * to avoid JIT profile pollution. These readers are derived from HiveAbstractReader which implements + * code for the init and setup stages, but the repeated - and performance critical part - next() method is + * separately implemented in the classes generated from this template. The internal SkipRecordReader + * class is also separated due to the same reason. 
+ * + * As to the performance gain with this change, please refer to: + * https://issues.apache.org/jira/browse/DRILL-4982 + * + */ +<@pp.dropOutputFile /> +<#list hiveFormat.map as entry> +<@pp.changeOutputFile name="/org/apache/drill/exec/store/hive/Hive${entry.hiveReader}Reader.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.drill.exec.store.hive; + +import java.io.IOException; +import java.util.List; +import java.util.Properties; +import org.apache.drill.common.exceptions.DrillRuntimeException; +import org.apache.drill.common.exceptions.ExecutionSetupException; +import org.apache.drill.common.expression.SchemaPath; +import org.apache.drill.exec.ops.FragmentContext; +import org.apache.drill.exec.vector.AllocationHelper; +import org.apache.drill.exec.vector.ValueVector; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.hive.conf.HiveConf; + +import org.apache.hadoop.hive.serde2.SerDeException; + +import org.apache.hadoop.mapred.RecordReader; +<#if entry.hasHeaderFooter == true> +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import com.google.common.collect.Lists; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Queue; +import java.util.Set; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; +import org.apache.hadoop.hive.serde.serdeConstants; +</#if> + +public class Hive${entry.hiveReader}Reader extends HiveAbstractReader { + + Object key; +<#if entry.hasHeaderFooter == true> --- End diff -- Free marker allows simple boolean expressions: `<#if entry.hasHeaderFooter>` > Hive Queries degrade when queries switch between different formats > 
------------------------------------------------------------------ > > Key: DRILL-4982 > URL: https://issues.apache.org/jira/browse/DRILL-4982 > Project: Apache Drill > Issue Type: Bug > Reporter: Chunhui Shi > Assignee: Karthikeyan Manivannan > Priority: Critical > Fix For: 1.10.0 > > > We have seen degraded performance by doing these steps: > 1) generate the repro data: > python script repro.py as below: > import string > import random > > for i in range(30000000): > x1 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ > in range(random.randrange(19, 27))) > x2 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ > in range(random.randrange(19, 27))) > x3 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ > in range(random.randrange(19, 27))) > x4 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ > in range(random.randrange(19, 27))) > x5 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ > in range(random.randrange(19, 27))) > x6 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ > in range(random.randrange(19, 27))) > print > "{0}".format(x1),"{0}".format(x2),"{0}".format(x3),"{0}".format(x4),"{0}".format(x5),"{0}".format(x6) > python repro.py > repro.csv > 2) put these files in a dfs directory e.g. '/tmp/hiveworkspace/plain'. 
Under > hive prompt, use the following sql command to create an external table: > CREATE EXTERNAL TABLE `hiveworkspace`.`plain` (`id1` string, `id2` string, > `id3` string, `id4` string, `id5` string, `id6` string) ROW FORMAT SERDE > 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS TEXTFILE LOCATION > '/tmp/hiveworkspace/plain' > 3) create Hive's table of ORC|PARQUET format: > CREATE TABLE `hiveworkspace`.`plainorc` STORED AS ORC AS SELECT > id1,id2,id3,id4,id5,id6 from `hiveworkspace`.`plain`; > CREATE TABLE `hiveworkspace`.`plainparquet` STORED AS PARQUET AS SELECT > id1,id2,id3,id4,id5,id6 from `hiveworkspace`.`plain`; > 4) Query switch between these two tables, then the query time on the same > table significantly lengthened. On my setup, for ORC, it was 15sec -> 26secs. > Queries on table of other formats, after injecting a query to other formats, > all have significant slow down. -- This message was sent by Atlassian JIRA (v6.3.4#6332)