[ https://issues.apache.org/jira/browse/HUDI-3197?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17471220#comment-17471220 ]
sivabalan narayanan commented on HUDI-3197:
-------------------------------------------
Tested partition pruning via a local docker setup.
Spark datasource read:
{code:java}
$SPARK_INSTALL/bin/spark-shell \
--jars $HUDI_SPARK_BUNDLE \
--master local[2] \
--driver-class-path $HADOOP_CONF_DIR \
--conf spark.sql.hive.convertMetastoreParquet=false \
--deploy-mode client \
--driver-memory 4G \
--executor-memory 3G \
--num-executors 1 \
--packages org.apache.spark:spark-avro_2.11:2.4.4
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._

val tableName = "hudi_ny_tbl"
val basePath = "hdfs:///tmp/hudi_ny_tbl/"
val df = spark.read.format("parquet").load("file:///opt/parquet_dataset/parquet_ny_100_3/*")
df.write.format("hudi").
options(getQuickstartWriteConfigs).
option("hoodie.bulkinsert.shuffle.parallelism","20").
option(PRECOMBINE_FIELD_OPT_KEY, "tpep_pickup_datetime").
option(RECORDKEY_FIELD_OPT_KEY, "VendorID").
option(PARTITIONPATH_FIELD_OPT_KEY, "date_col").
option("hoodie.datasource.write.operation","bulk_insert").
option("write.parquet.max.file.size","31457280").
option("hoodie.parquet.block.size","31457280").
option("hoodie.metadata.enable","false").
option(TABLE_NAME, tableName).
mode(Overwrite).
save(basePath)
val tripsSnapshotDF = spark.
read.
format("hudi").
load(basePath)
// load(basePath) uses "/partitionKey=partitionValue" folder structure for Spark auto partition discovery
tripsSnapshotDF.createOrReplaceTempView("hudi_tbl")
{code}
Query across all partitions: files read 485
spark.sql("select count(*) from hudi_tbl").show()
!Screen Shot 2022-01-08 at 3.22.54 PM.png!
Query with partition pruning: files read 1
spark.sql("select count(*) from hudi_tbl where date_col = '2019-01-11' ").show()
!Screen Shot 2022-01-08 at 3.23.04 PM.png!
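As an additional sanity check (not captured in the run above), the physical plan should confirm the pruning: with the predicate pushed down, the file scan node lists a partition filter on date_col instead of scanning every partition. A minimal sketch, assuming the same spark-shell session and temp view as above:
{code:java}
// Sketch: print the extended plan for the pruned query. With pruning in
// effect, the scan node should report a partition filter on date_col;
// the exact plan text varies by Spark and Hudi version.
spark.sql("select count(*) from hudi_tbl where date_col = '2019-01-11'").explain(true)
{code}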
Hive table read via spark.sql:
{code:java}
$SPARK_INSTALL/bin/spark-shell \
--jars $HUDI_SPARK_BUNDLE \
--master local[2] \
--driver-class-path $HADOOP_CONF_DIR \
--conf spark.sql.hive.convertMetastoreParquet=false \
--deploy-mode client \
--driver-memory 4G \
--executor-memory 3G \
--num-executors 1 \
--packages org.apache.spark:spark-avro_2.11:2.4.4
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._

val tableName = "hudi_ny_tbl"
val basePath = "hdfs:///tmp/hudi_ny_tbl/"
val df = spark.read.format("parquet").load("file:///opt/parquet_dataset/parquet_ny_100_3/*")
df.write.format("hudi").
options(getQuickstartWriteConfigs).
option("hoodie.bulkinsert.shuffle.parallelism","20").
option(PRECOMBINE_FIELD_OPT_KEY, "tpep_pickup_datetime").
option(RECORDKEY_FIELD_OPT_KEY, "VendorID").
option(PARTITIONPATH_FIELD_OPT_KEY, "date_col").
option("hoodie.datasource.write.operation","bulk_insert").
option("write.parquet.max.file.size","31457280").
option("hoodie.parquet.block.size","31457280").
option("hoodie.metadata.enable","false").
option(TABLE_NAME, tableName).
option("hoodie.datasource.hive_sync.mode","hms").
option("hoodie.datasource.hive_sync.database","testdb").
option("hoodie.datasource.hive_sync.table","testtable3").
option("hoodie.datasource.hive_sync.partition_fields","date_col").
option("hoodie.datasource.hive_sync.enable","true").
option("hoodie.datasource.hive_sync.assume_date_partitioning","false").
option("hoodie.datasource.hive_sync.partition_extractor_class","org.apache.hudi.hive.MultiPartKeysValueExtractor").
mode(Overwrite).
save(basePath)
val tripsSnapshotDF = spark.
read.
format("hudi").
load(basePath)
// load(basePath) uses "/partitionKey=partitionValue" folder structure for Spark auto partition discovery
tripsSnapshotDF.createOrReplaceTempView("hudi_tbl")
{code}
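Before querying, the sync can be sanity-checked by listing the partitions registered in the metastore. A minimal sketch, assuming the Hive-synced table configured in the write above:
{code:java}
// Sketch: the Hive sync above should have registered one partition per
// distinct date_col value in testdb.testtable3.
spark.sql("show partitions testdb.testtable3").show(100, false)
{code}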
Query without partition pruning: files read 486
spark.sql("select count(*) from testdb.testtable3 ").show()
!Screen Shot 2022-01-08 at 3.26.13 PM.png!
Query with partition pruning: files read 1
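(The pruned query itself was not pasted above; presumably the analogous predicate query against the Hive-synced table:)
spark.sql("select count(*) from testdb.testtable3 where date_col = '2019-01-11'").show()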
!Screen Shot 2022-01-08 at 3.26.53 PM.png!
> Validate partition pruning with Spark SQL
> -----------------------------------------
>
> Key: HUDI-3197
> URL: https://issues.apache.org/jira/browse/HUDI-3197
> Project: Apache Hudi
> Issue Type: Task
> Reporter: Raymond Xu
> Assignee: Raymond Xu
> Priority: Major
> Attachments: Screen Shot 2022-01-08 at 3.22.54 PM.png, Screen Shot
> 2022-01-08 at 3.23.04 PM.png, Screen Shot 2022-01-08 at 3.26.13 PM.png,
> Screen Shot 2022-01-08 at 3.26.53 PM.png
>
>