Github user liancheng commented on a diff in the pull request:
https://github.com/apache/spark/pull/14119#discussion_r70245522
--- Diff: examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala ---
@@ -41,43 +35,47 @@ object HiveFromSpark {
// in the current directory and creates a directory configured by `spark.sql.warehouse.dir`,
// which defaults to the directory `spark-warehouse` in the current directory where the Spark
// application is started.
- val spark = SparkSession.builder
- .appName("HiveFromSpark")
- .enableHiveSupport()
- .getOrCreate()
+
+ // $example on:spark_hive$
+ // warehouseLocation points to the default location for managed databases and tables
+ val warehouseLocation = "file:${system:user.dir}/spark-warehouse"
+
+ val spark = SparkSession
+ .builder()
+ .appName("Spark Hive Example")
+ .config("spark.sql.warehouse.dir", warehouseLocation)
+ .enableHiveSupport()
+ .getOrCreate()
import spark.implicits._
import spark.sql
sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
- sql(s"LOAD DATA LOCAL INPATH '${kv1File.getAbsolutePath}' INTO TABLE
src")
+ sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO
TABLE src")
// Queries are expressed in HiveQL
- println("Result of 'SELECT *': ")
- sql("SELECT * FROM src").collect().foreach(println)
+ sql("SELECT * FROM src").show()
// Aggregation queries are also supported.
- val count = sql("SELECT COUNT(*) FROM src").collect().head.getLong(0)
- println(s"COUNT(*): $count")
+ sql("SELECT COUNT(*) FROM src").show()
- // The results of SQL queries are themselves RDDs and support all normal RDD functions. The
- // items in the RDD are of type Row, which allows you to access each column by ordinal.
- val rddFromSql = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key")
+ // The results of SQL queries are themselves DataFrames and support all normal functions.
+ val sqlDF = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key")
- println("Result of RDD.map:")
- val rddAsStrings = rddFromSql.rdd.map {
+ // The items in DaraFrames are of type Row, which allows you to access
each column by ordinal.
+ val stringsDS = sqlDF.map {
case Row(key: Int, value: String) => s"Key: $key, Value: $value"
}
+ stringsDS.show()
- // You can also use RDDs to create temporary views within a HiveContext.
- val rdd = spark.sparkContext.parallelize((1 to 100).map(i => Record(i, s"val_$i")))
- rdd.toDF().createOrReplaceTempView("records")
+ // You can also use DataFrames to create temporary views within a HiveContext.
+ val recordsDF = spark.createDataFrame((1 to 100).map(i => Record(i, s"val_$i")))
+ recordsDF.createOrReplaceTempView("records")
- // Queries can then join RDD data with data stored in Hive.
- println("Result of SELECT *:")
- sql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println)
+ // Queries can then join DataFrame data with data stored in Hive.
+ sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show()
+ // $example off:spark_hive$
spark.stop()
}
-}
-// scalastyle:on println
+}
--- End diff ---
Nit: Please add a newline at the end of the source file.
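
For anyone who wants to try the new flow outside of this diff, here is a minimal standalone sketch of the same pattern (a Hive-enabled SparkSession with an explicit warehouse location, a DataFrame-backed temp view, and the Row-to-String mapping). This is my own sketch, not code from the PR; it assumes a Spark 2.0 build with Hive support on the classpath, and the object and app names are placeholders:

    import org.apache.spark.sql.{Row, SparkSession}

    object SparkHiveSketch {
      // Mirrors the Record case class used by the example under review
      case class Record(key: Int, value: String)

      def main(args: Array[String]): Unit = {
        // Warehouse location for managed databases and tables, as in the diff
        val warehouseLocation = "file:${system:user.dir}/spark-warehouse"

        val spark = SparkSession
          .builder()
          .appName("SparkHiveSketch") // placeholder app name
          .config("spark.sql.warehouse.dir", warehouseLocation)
          .enableHiveSupport()
          .getOrCreate()
        import spark.implicits._

        // A local DataFrame registered as a temp view; no Hive table needed yet
        val recordsDF = spark.createDataFrame((1 to 100).map(i => Record(i, s"val_$i")))
        recordsDF.createOrReplaceTempView("records")

        // Rows map to Strings via the implicit Encoder from spark.implicits._,
        // the same pattern that replaces rddFromSql.rdd.map in the diff
        val stringsDS = recordsDF.map {
          case Row(key: Int, value: String) => s"Key: $key, Value: $value"
        }
        stringsDS.show()

        spark.stop()
      }
    }

Run it with spark-submit against a build compiled with the Hive profile (-Phive); queries against real Hive tables additionally need a reachable metastore, otherwise Spark falls back to a local Derby metastore_db in the current directory.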