[
https://issues.apache.org/jira/browse/SPARK-11087?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14993771#comment-14993771
]
patcharee edited comment on SPARK-11087 at 11/6/15 2:56 PM:
------------------------------------------------------------
Hi, I found another scenario where the predicate pushdown does not work. [~zzhan] Can
you please have a look?
First create a hive table >>
hive> create table people(name string, address string, phone string)
partitioned by(age int) stored as orc;
Then use the Spark shell in local mode to insert the data and then query >>
120 import org.apache.spark.sql.Row
121 import org.apache.spark.{SparkConf, SparkContext}
122 import org.apache.spark.sql.types._
123 import
org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType,FloatType}
124 sqlContext.setConf("hive.exec.dynamic.partition.mode","nonstrict")
125 val records = (1 to 10).map( i => Row(s"name_$i", s"address_$i",
s"phone_$i", i ))
126 val schemaString = "name address phone age"
127 val schema = StructType(schemaString.split(" ").map(fieldName => if
(fieldName.equals("age")) StructField(fieldName, IntegerType, true) else
StructField(fieldName, StringType, true)))
128 val x = sc.parallelize(records)
129 val rDF = sqlContext.createDataFrame(x, schema)
130
rDF.write.format("org.apache.spark.sql.hive.orc.DefaultSource").mode(org.apache.spark.sql.SaveMode.Append).partitionBy("age").saveAsTable("people")
131 sqlContext.setConf("spark.sql.orc.filterPushdown", "true")
132 val people =
sqlContext.read.format("orc").load("/user/hive/warehouse/people")
133 people.registerTempTable("people")
134 sqlContext.sql("SELECT * FROM people WHERE age = 3 and name =
'name_3'").count
Below is a part of the log message from the last command >>
15/11/06 15:40:36 INFO HadoopRDD: Input split:
hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000:0+453
15/11/06 15:40:36 DEBUG OrcInputFormat: No ORC pushdown predicate
15/11/06 15:40:36 DEBUG OrcInputFormat: No ORC pushdown predicate
15/11/06 15:40:36 INFO OrcRawRecordMerger: min key = null, max key = null
15/11/06 15:40:36 INFO OrcRawRecordMerger: min key = null, max key = null
15/11/06 15:40:36 INFO ReaderImpl: Reading ORC rows from
hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000 with
{include: [true, true, false, false], offset: 0, length: 9223372036854775807}
15/11/06 15:40:36 INFO ReaderImpl: Reading ORC rows from
hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000 with
{include: [true, true, false, false], offset: 0, length: 9223372036854775807}
15/11/06 15:40:36 INFO RecordReaderFactory: Schema is not specified on read.
Using file schema.
15/11/06 15:40:36 INFO RecordReaderFactory: Schema is not specified on read.
Using file schema.
15/11/06 15:40:36 DEBUG RecordReaderImpl: chunks = [range start: 111 end: 126]
15/11/06 15:40:36 DEBUG RecordReaderImpl: chunks = [range start: 111 end: 126]
15/11/06 15:40:36 DEBUG RecordReaderImpl: merge = [data range [111, 126), size:
15 type: array-backed]
15/11/06 15:40:36 DEBUG RecordReaderImpl: merge = [data range [111, 126), size:
15 type: array-backed]
15/11/06 15:40:36 INFO GeneratePredicate: Code generated in 5.063287 ms
was (Author: patcharee):
Hi, I found a scenario where the predicate does not work again. [~zzhan] Can
you please have a look?
First create a hive table >>
hive> create table people(name string, address string, phone string)
partitioned by(age int) stored as orc;
Then use spark shell local mode to insert data and then query >>
120 import org.apache.spark.sql.Row
121 import org.apache.spark.{SparkConf, SparkContext}
122 import org.apache.spark.sql.types._
123 import
org.apache.spark.sql.types.{StructType,StructField,StringType,IntegerType,FloatType}
124 sqlContext.setConf("hive.exec.dynamic.partition.mode","nonstrict")
125 val records = (1 to 10).map( i => Row(s"name_$i", s"address_$i",
s"phone_$i", i ))
126 val schemaString = "name address phone age"
127 val schema = StructType(schemaString.split(" ").map(fieldName => if
(fieldName.equals("age")) StructField(fieldName, IntegerType, true) else
StructField(fieldName, StringType, true)))
128 val x = sc.parallelize(records)
129 val rDF = sqlContext.createDataFrame(x, schema)
130
rDF.write.format("org.apache.spark.sql.hive.orc.DefaultSource").mode(org.apache.spark.sql.SaveMode.Append).partitionBy("age").saveAsTable("people")
131 sqlContext.setConf("spark.sql.orc.filterPushdown", "true")
132 val people =
sqlContext.read.format("orc").load("/user/hive/warehouse/people")
133 people.registerTempTable("people")
134 sqlContext.sql("SELECT * FROM people WHERE age = 3 and name =
'name_3'").count
15/11/06 15:40:36 INFO HadoopRDD: Input split:
hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000:0+453
15/11/06 15:40:36 DEBUG OrcInputFormat: No ORC pushdown predicate
15/11/06 15:40:36 DEBUG OrcInputFormat: No ORC pushdown predicate
15/11/06 15:40:36 INFO OrcRawRecordMerger: min key = null, max key = null
15/11/06 15:40:36 INFO OrcRawRecordMerger: min key = null, max key = null
15/11/06 15:40:36 INFO ReaderImpl: Reading ORC rows from
hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000 with
{include: [true, true, false, false], offset: 0, length: 9223372036854775807}
15/11/06 15:40:36 INFO ReaderImpl: Reading ORC rows from
hdfs://localhost:9000/user/hive/warehouse/people/age=3/part-00000 with
{include: [true, true, false, false], offset: 0, length: 9223372036854775807}
15/11/06 15:40:36 INFO RecordReaderFactory: Schema is not specified on read.
Using file schema.
15/11/06 15:40:36 INFO RecordReaderFactory: Schema is not specified on read.
Using file schema.
15/11/06 15:40:36 DEBUG RecordReaderImpl: chunks = [range start: 111 end: 126]
15/11/06 15:40:36 DEBUG RecordReaderImpl: chunks = [range start: 111 end: 126]
15/11/06 15:40:36 DEBUG RecordReaderImpl: merge = [data range [111, 126), size:
15 type: array-backed]
15/11/06 15:40:36 DEBUG RecordReaderImpl: merge = [data range [111, 126), size:
15 type: array-backed]
15/11/06 15:40:36 INFO GeneratePredicate: Code generated in 5.063287 ms
> spark.sql.orc.filterPushdown does not work, No ORC pushdown predicate
> ---------------------------------------------------------------------
>
> Key: SPARK-11087
> URL: https://issues.apache.org/jira/browse/SPARK-11087
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 1.5.1
> Environment: orc file version 0.12 with HIVE_8732
> hive version 1.2.1.2.3.0.0-2557
> Reporter: patcharee
> Priority: Minor
>
> I have an external hive table stored as partitioned orc file (see the table
> schema below). I tried to query from the table with where clause>
> hiveContext.setConf("spark.sql.orc.filterPushdown", "true")
> hiveContext.sql("select u, v from 4D where zone = 2 and x = 320 and y =
> 117").
> But from the log file with debug logging level on, the ORC pushdown predicate
> was not generated.
> Unfortunately my table was not sorted when I inserted the data, but I still
> expected the ORC pushdown predicate to be generated (because of the where
> clause).
> Table schema
> ================================
> hive> describe formatted 4D;
> OK
> # col_name data_type comment
>
> date int
> hh int
> x int
> y int
> height float
> u float
> v float
> w float
> ph float
> phb float
> t float
> p float
> pb float
> qvapor float
> qgraup float
> qnice float
> qnrain float
> tke_pbl float
> el_pbl float
> qcloud float
>
> # Partition Information
> # col_name data_type comment
>
> zone int
> z int
> year int
> month int
>
> # Detailed Table Information
> Database: default
> Owner: patcharee
> CreateTime: Thu Jul 09 16:46:54 CEST 2015
> LastAccessTime: UNKNOWN
> Protect Mode: None
> Retention: 0
> Location: hdfs://helmhdfs/apps/hive/warehouse/wrf_tables/4D
>
> Table Type: EXTERNAL_TABLE
> Table Parameters:
> EXTERNAL TRUE
> comment this table is imported from rwf_data/*/wrf/*
> last_modified_by patcharee
> last_modified_time 1439806692
> orc.compress ZLIB
> transient_lastDdlTime 1439806692
>
> # Storage Information
> SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde
> InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
> OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
>
> Compressed: No
> Num Buckets: -1
> Bucket Columns: []
> Sort Columns: []
> Storage Desc Params:
> serialization.format 1
> Time taken: 0.388 seconds, Fetched: 58 row(s)
> ================================
> Data was inserted into this table by another spark job>
> df.write.format("org.apache.spark.sql.hive.orc.DefaultSource").mode(org.apache.spark.sql.SaveMode.Append).partitionBy("zone","z","year","month").saveAsTable("4D")
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]