Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20018#discussion_r158407910

    --- Diff: examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala ---
    @@ -102,8 +101,41 @@ object SparkHiveExample {
         // |  4| val_4|  4| val_4|
         // |  5| val_5|  5| val_5|
         // ...
    -    // $example off:spark_hive$
    +    // Create Hive managed table with parquet
    +    sql("CREATE TABLE records(key int, value string) STORED AS PARQUET")
    +    // Save DataFrame to Hive Managed table as Parquet format
    +    val hiveTableDF = sql("SELECT * FROM records")
    +    hiveTableDF.write.mode(SaveMode.Overwrite).saveAsTable("database_name.records")
    +    // Create External Hive table with parquet
    +    sql("CREATE EXTERNAL TABLE records(key int, value string) " +
    +      "STORED AS PARQUET LOCATION '/user/hive/warehouse/'")
    +    // to make Hive parquet format compatible with spark parquet format
    +    spark.sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true")
    +
    +    // Multiple parquet files could be created accordingly to volume of data under directory given.
    +    val hiveExternalTableLocation = "/user/hive/warehouse/database_name.db/records"
    +
    +    // Save DataFrame to Hive External table as compatible parquet format
    +    hiveTableDF.write.mode(SaveMode.Overwrite).parquet(hiveExternalTableLocation)
    +
    +    // turn on flag for Dynamic Partitioning
    +    spark.sqlContext.setConf("hive.exec.dynamic.partition", "true")
    +    spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
    +
    +    // You can create partitions in Hive table, so downstream queries run much faster.
    +    hiveTableDF.write.mode(SaveMode.Overwrite).partitionBy("key")
    +      .parquet(hiveExternalTableLocation)
    +
    +    // reduce number of files for each partition by repartition
    --- End diff --

    `reduce` -> `Reduce`.
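For readers following the diff: the hunk shown above truncates right after the comment under review, before the write it describes. A minimal sketch of that repartition-before-write step might look like the following, reusing `hiveTableDF` and `hiveExternalTableLocation` from the quoted diff; the choice of `col("key")` as the repartition expression is an assumption for illustration, not taken from the PR:

```scala
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.col

// Repartition on the partition column before writing: rows with the same
// key land in the same task, so each Hive partition directory is written
// by a single task instead of accumulating many small files.
hiveTableDF.repartition(col("key"))  // repartition expression is assumed
  .write.mode(SaveMode.Overwrite)
  .partitionBy("key")
  .parquet(hiveExternalTableLocation)
```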