Github user chetkhatri commented on a diff in the pull request:
https://github.com/apache/spark/pull/20018#discussion_r158454282
--- Diff: examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala ---
@@ -102,8 +101,41 @@ object SparkHiveExample {
// | 4| val_4| 4| val_4|
// | 5| val_5| 5| val_5|
// ...
- // $example off:spark_hive$
+ // Create a Hive managed table stored as Parquet
+ sql("CREATE TABLE records(key int, value string) STORED AS PARQUET")
+ // Save a DataFrame to the Hive managed table in Parquet format
+ val hiveTableDF = sql("SELECT * FROM records")
+
+ hiveTableDF.write.mode(SaveMode.Overwrite).saveAsTable("database_name.records")
+ // Create an external Hive table stored as Parquet
+ sql("CREATE EXTERNAL TABLE records(key int, value string) " +
+ "STORED AS PARQUET LOCATION '/user/hive/warehouse/'")
+ // Make the Hive Parquet format compatible with the Spark Parquet format
+ spark.sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true")
+
+ // Multiple Parquet files may be created, depending on the volume of data under the given directory.
+ val hiveExternalTableLocation = "/user/hive/warehouse/database_name.db/records"
+
+ // Save the DataFrame to the external Hive table in the compatible Parquet format
+
+ hiveTableDF.write.mode(SaveMode.Overwrite).parquet(hiveExternalTableLocation)
+
+ // Turn on the flags for dynamic partitioning
+ spark.sqlContext.setConf("hive.exec.dynamic.partition", "true")
+ spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
+
+ // You can create partitions in the Hive table so that downstream queries run much faster.
+ hiveTableDF.write.mode(SaveMode.Overwrite).partitionBy("key")
+ .parquet(hiveExternalTableLocation)
+
+ // Reduce the number of files for each partition by repartitioning (see the sketch at the end of this message)
--- End diff --
@HyukjinKwon Thanks for highlighting this; I've improved it accordingly.
---
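The last comment in the diff describes a repartition step that the quoted hunk cuts off before showing. As a hedged illustration only, not the PR's actual code, a continuation of the example above might look like the following; the partition count of 2 is an arbitrary assumption, while hiveTableDF and hiveExternalTableLocation are the values defined earlier in the diff:

    // Illustrative sketch: repartition before writing so each Hive partition
    // directory holds fewer, larger Parquet files.
    // The count of 2 is an arbitrary assumption, not taken from the PR.
    hiveTableDF.repartition(2)
      .write.mode(SaveMode.Overwrite)
      .partitionBy("key")
      .parquet(hiveExternalTableLocation)

With repartition(2), the job writes with two tasks, so each "key" partition directory receives at most two Parquet files under default writer settings, which keeps directory listings small for downstream readers.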