Github user maropu commented on the issue:
https://github.com/apache/spark/pull/21070
ok, I got the reason why:
```
$ ./bin/spark-shell --master=local[1]
scala> import org.apache.spark.sql.functions._
scala> import scala.util.Random
scala> :paste
def timer[R](f: => R): Unit = {
  val count = 5
  val iters = (0 until count).map { i =>
    val t0 = System.nanoTime()
    f
    val t1 = System.nanoTime()
    val elapsed = (t1 - t0).toDouble
    println(s"#$i: ${elapsed / 1000000000.0}")
    elapsed
  }
  println("Avg. Elapsed Time: " + ((iters.sum / count) / 1000000000.0) + "s")
}
var dir = "/home/ec2-user/parquet-test-string"
val numRows = 1024 * 1024 * 15
val width = 6
val selectExpr = (1 to width).map(i => s"CAST(value AS STRING) c$i")
val valueCol = monotonically_increasing_id().cast("string")
val df = spark.range(numRows).map(_ =>
Random.nextLong).selectExpr(selectExpr: _*).withColumn("value",
valueCol).sort("value")
df.write.mode("overwrite").parquet(dir)
spark.read.parquet(dir).createOrReplaceTempView("parquetTable")
scala> sql("SET spark.sql.parquet.filterPushdown=true")
scala> timer { sql("select * from parquetTable where value = '0'").collect }
#0: 1.041495043
#1: 0.53017502
#2: 0.468505896
#3: 0.437655119
#4: 0.429151435
Avg. Elapsed Time: 0.5813965026s
scala> sql("select * from parquetTable where value = 0").explain
== Physical Plan ==
*(1) Project [c1#35, c2#36, c3#37, c4#38, c5#39, c6#40, value#41]
+- *(1) Filter (isnotnull(value#41) && (cast(value#41 as int) = 0))
   +- *(1) FileScan parquet [c1#35,c2#36,c3#37,c4#38,c5#39,c6#40,value#41] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/ec2-user/parquet-test-string], PartitionFilters: [], PushedFilters: [IsNotNull(value)], ReadSchema: struct<c1:string,c2:string,c3:string,c4:string,c5:string,c6:string,value:string>
scala> timer { sql("select * from parquetTable where value = 0").collect }
#0: 10.656159769
#1: 10.583965537
#2: 10.486018192
#3: 10.475532898
#4: 10.494059857
Avg. Elapsed Time: 10.539147250600001s
scala> sql("select * from parquetTable where value = '0'").explain
== Physical Plan ==
*(1) Project [c1#35, c2#36, c3#37, c4#38, c5#39, c6#40, value#41]
+- *(1) Filter (isnotnull(value#41) && (value#41 = 0))
   +- *(1) FileScan parquet [c1#35,c2#36,c3#37,c4#38,c5#39,c6#40,value#41] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/home/ec2-user/parquet-test-string], PartitionFilters: [], PushedFilters: [IsNotNull(value), EqualTo(value,0)], ReadSchema: struct<c1:string,c2:string,c3:string,c4:string,c5:string,c6:string,value:string>
```
We do push down the predicate `value = '0'` into the Parquet filters, but we don't push down the predicate `value = 0`: the analyzer rewrites it into `cast(value#41 as int) = 0`, and that cast on the column prevents it from being translated into a Parquet filter, so only `IsNotNull(value)` gets pushed down. So, this is a Spark-side issue in the push-down handling.
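For context, here's a rough sketch of why the cast blocks the push-down. This is a simplified illustration (the hypothetical `translate` helper is not Spark's actual `DataSourceStrategy`/`ParquetFilters` code): the translation from Catalyst predicates into data source `Filter`s only fires when the comparison is between a bare column and a literal, so a predicate whose column is wrapped in a `Cast` falls through and has to be evaluated on the Spark side after reading all the rows.
```
import org.apache.spark.sql.catalyst.expressions.{Attribute, EqualTo, Expression, Literal}
import org.apache.spark.sql.sources

// Simplified sketch of predicate translation (illustrative only, not the
// real Spark code path): only an equality between a bare attribute and a
// literal is turned into a data source Filter.
def translate(predicate: Expression): Option[sources.Filter] = predicate match {
  case EqualTo(a: Attribute, Literal(v, _)) => Some(sources.EqualTo(a.name, v))
  case EqualTo(Literal(v, _), a: Attribute) => Some(sources.EqualTo(a.name, v))
  // `cast(value#41 as int) = 0` wraps the attribute in a Cast, so it doesn't
  // match the cases above and nothing is pushed down to Parquet.
  case _ => None
}
```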