[ 
https://issues.apache.org/jira/browse/DRILL-4982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15731189#comment-15731189
 ] 

ASF GitHub Bot commented on DRILL-4982:
---------------------------------------

Github user paul-rogers commented on a diff in the pull request:

    https://github.com/apache/drill/pull/638#discussion_r91447926
  
    --- Diff: contrib/storage-hive/core/src/main/codegen/templates/HiveRecordReaders.java ---
    @@ -0,0 +1,300 @@
    +/**
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +/**
    + * This template is used to generate a separate Hive record reader class for each data format,
    + * to avoid JIT profile pollution. These readers are derived from HiveAbstractReader, which implements
    + * the code for the init and setup stages, while the repeated - and performance-critical - next() method
    + * is implemented separately in the classes generated from this template. The internal SkipRecordsInspector
    + * class is also separated out for the same reason.
    + *
    + * For the performance gain from this change, please refer to:
    + * https://issues.apache.org/jira/browse/DRILL-4982
    + *
    + */
    +<@pp.dropOutputFile />
    +<#list hiveFormat.map as entry>
    +<@pp.changeOutputFile name="/org/apache/drill/exec/store/hive/Hive${entry.hiveReader}Reader.java" />
    +<#include "/@includes/license.ftl" />
    +
    +package org.apache.drill.exec.store.hive;
    +
    +import java.io.IOException;
    +import java.util.List;
    +import java.util.Properties;
    +import org.apache.drill.common.exceptions.DrillRuntimeException;
    +import org.apache.drill.common.exceptions.ExecutionSetupException;
    +import org.apache.drill.common.expression.SchemaPath;
    +import org.apache.drill.exec.ops.FragmentContext;
    +import org.apache.drill.exec.vector.AllocationHelper;
    +import org.apache.drill.exec.vector.ValueVector;
    +import org.apache.hadoop.hive.metastore.api.Partition;
    +import org.apache.hadoop.hive.metastore.api.Table;
    +import org.apache.hadoop.io.Writable;
    +import org.apache.hadoop.mapred.InputSplit;
    +import org.apache.hadoop.security.UserGroupInformation;
    +import org.apache.hadoop.hive.conf.HiveConf;
    +
    +import org.apache.hadoop.hive.serde2.SerDeException;
    +
    +import org.apache.hadoop.mapred.RecordReader;
    +<#if entry.hasHeaderFooter == true>
    +import org.apache.hadoop.hive.serde2.SerDe;
    +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
    +import com.google.common.collect.Lists;
    +import java.util.ArrayList;
    +import java.util.Arrays;
    +import java.util.HashSet;
    +import java.util.Queue;
    +import java.util.Set;
    +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
    +import org.apache.hadoop.hive.serde.serdeConstants;
    +</#if>
    +
    +public class Hive${entry.hiveReader}Reader extends HiveAbstractReader {
    +
    +  Object key;
    +<#if entry.hasHeaderFooter == true>
    +  SkipRecordsInspector skipRecordsInspector;
    +<#else>
    +  Object value;
    +</#if>
    +
    +  public Hive${entry.hiveReader}Reader(Table table, Partition partition, InputSplit inputSplit, List<SchemaPath> projectedColumns,
    +                       FragmentContext context, final HiveConf hiveConf,
    +                       UserGroupInformation proxyUgi) throws ExecutionSetupException {
    +    super(table, partition, inputSplit, projectedColumns, context, hiveConf, proxyUgi);
    +  }
    +
    +  public void internalInit(Properties tableProperties, RecordReader<Object, Object> reader) {
    +
    +    key = reader.createKey();
    +<#if entry.hasHeaderFooter == true>
    +    skipRecordsInspector = new SkipRecordsInspector(tableProperties, reader);
    +<#else>
    +    value = reader.createValue();
    +</#if>
    +
    +  }
    +  private void readHiveRecordAndInsertIntoRecordBatch(Object deSerializedValue, int outputRecordIndex) {
    +    for (int i = 0; i < selectedStructFieldRefs.size(); i++) {
    +      Object hiveValue = finalOI.getStructFieldData(deSerializedValue, selectedStructFieldRefs.get(i));
    +      if (hiveValue != null) {
    +        selectedColumnFieldConverters.get(i).setSafeValue(selectedColumnObjInspectors.get(i), hiveValue,
    +          vectors.get(i), outputRecordIndex);
    +      }
    +    }
    +  }
    +
    +<#if entry.hasHeaderFooter == true>
    +  @Override
    +  public int next() {
    +    for (ValueVector vv : vectors) {
    +      AllocationHelper.allocateNew(vv, TARGET_RECORD_COUNT);
    +    }
    +    if (empty) {
    +      setValueCountAndPopulatePartitionVectors(0);
    +      return 0;
    +    }
    +
    +    try {
    +      skipRecordsInspector.reset();
    +      Object value;
    +
    +      int recordCount = 0;
    +
    +      while (recordCount < TARGET_RECORD_COUNT && reader.next(key, value = skipRecordsInspector.getNextValue())) {
    +        if (skipRecordsInspector.doSkipHeader(recordCount++)) {
    +          continue;
    +        }
    +        Object bufferedValue = skipRecordsInspector.bufferAdd(value);
    +        if (bufferedValue != null) {
    +          Object deSerializedValue = partitionSerDe.deserialize((Writable) bufferedValue);
    +          if (partTblObjectInspectorConverter != null) {
    +            deSerializedValue = partTblObjectInspectorConverter.convert(deSerializedValue);
    +          }
    +          readHiveRecordAndInsertIntoRecordBatch(deSerializedValue, skipRecordsInspector.getActualCount());
    +          skipRecordsInspector.incrementActualCount();
    +        }
    +        skipRecordsInspector.incrementTempCount();
    +      }
    +
    +      setValueCountAndPopulatePartitionVectors(skipRecordsInspector.getActualCount());
    +      skipRecordsInspector.updateContinuance();
    +      return skipRecordsInspector.getActualCount();
    +    } catch (IOException | SerDeException e) {
    +      throw new DrillRuntimeException(e);
    +    }
    +  }
    +
    +/**
    + * SkipRecordsInspector encapsulates the logic to skip a file's header and footer records.
    + * The logic applies only to the file formats predefined in the constructor.
    + */
    +protected class SkipRecordsInspector {
    --- End diff --
    
    Is there anything different in this nested class across the generated
    classes? If not, can you explain why we want multiple copies of the same
    code? A quick comment will help greatly.
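    
    For reference, a minimal sketch of the alternative this question points at,
    assuming the nested class really is identical in every generated reader: the
    shared helper could be declared once in the base class, with only the
    performance-critical next() loop left in the generated code. The class and
    method names below are illustrative only and do not come from the Drill source:
    
        // Hypothetical structure: one shared helper in the base class instead of
        // a copy emitted into every generated per-format reader.
        abstract class BaseReader {
          protected static class SkipInspector {
            private final int headerLines;
            SkipInspector(int headerLines) { this.headerLines = headerLines; }
            boolean skipHeader(int recordIndex) { return recordIndex < headerLines; }
          }
    
          // Single shared instance; constructor argument is illustrative.
          protected final SkipInspector inspector = new SkipInspector(1);
    
          // Only the hot read loop differs per format.
          abstract int next();
        }
    
        class TextFormatReader extends BaseReader {
          @Override
          int next() {
            int emitted = 0;
            for (int i = 0; i < 10; i++) {
              if (!inspector.skipHeader(i)) {
                emitted++;
              }
            }
            return emitted;
          }
        }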


> Hive Queries degrade when queries switch between different formats
> ------------------------------------------------------------------
>
>                 Key: DRILL-4982
>                 URL: https://issues.apache.org/jira/browse/DRILL-4982
>             Project: Apache Drill
>          Issue Type: Bug
>            Reporter: Chunhui Shi
>            Assignee: Karthikeyan Manivannan
>            Priority: Critical
>             Fix For: 1.10.0
>
>
> We have seen degraded performance by doing these steps:
> 1) generate the repro data:
> run the Python script repro.py, as below:
> import string
> import random
>  
> for i in range(30000000):
>     x1 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x2 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x3 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x4 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x5 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     x6 = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(random.randrange(19, 27)))
>     print "{0}".format(x1),"{0}".format(x2),"{0}".format(x3),"{0}".format(x4),"{0}".format(x5),"{0}".format(x6)
> python repro.py > repro.csv
> 2) put the generated data in a DFS directory, e.g. '/tmp/hiveworkspace/plain'. At the 
> hive prompt, use the following SQL command to create an external table:
> CREATE EXTERNAL TABLE `hiveworkspace`.`plain` (`id1` string, `id2` string, 
> `id3` string, `id4` string, `id5` string, `id6` string) ROW FORMAT SERDE 
> 'org.apache.hadoop.hive.serde2.OpenCSVSerde' STORED AS TEXTFILE LOCATION 
> '/tmp/hiveworkspace/plain'
> 3) create Hive tables in ORC and Parquet formats:
> CREATE TABLE `hiveworkspace`.`plainorc` STORED AS ORC AS SELECT 
> id1,id2,id3,id4,id5,id6 from `hiveworkspace`.`plain`;
> CREATE TABLE `hiveworkspace`.`plainparquet` STORED AS PARQUET AS SELECT 
> id1,id2,id3,id4,id5,id6 from `hiveworkspace`.`plain`;
> 4) Switch queries between these two tables; the query time on the same table 
> then lengthens significantly. On my setup, for ORC, it went from 15 sec to 26 sec. 
> Queries on tables of the other formats, after a query against a different format 
> is injected, all show a significant slowdown.
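
As background on why the patch splits the readers per format (a sketch of the
mechanism, not taken from the report): when one shared read loop calls
reader.next() on several concrete reader types, the JIT's type profile at that
call site becomes polluted (megamorphic) and inlining stops, which matches the
slowdown seen after switching formats. A minimal, hypothetical Java
illustration of such a shared call site; the class names are made up:

    // One generic hot loop shared by all formats: after readers of several
    // concrete types have passed through drain(), the reader.next() call site
    // has observed multiple receiver types and is harder for the JIT to inline.
    interface FormatReader {
      int next();
    }

    final class OrcLikeReader implements FormatReader {
      public int next() { return 1; }
    }

    final class ParquetLikeReader implements FormatReader {
      public int next() { return 2; }
    }

    public class ProfilePollutionDemo {
      static long drain(FormatReader reader, int records) {
        long total = 0;
        for (int i = 0; i < records; i++) {
          total += reader.next();  // shared virtual call site
        }
        return total;
      }

      public static void main(String[] args) {
        System.out.println(drain(new OrcLikeReader(), 1_000_000));
        System.out.println(drain(new ParquetLikeReader(), 1_000_000));
        // Later OrcLikeReader runs now execute against the polluted profile.
        System.out.println(drain(new OrcLikeReader(), 1_000_000));
      }
    }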



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
