Github user liancheng commented on a diff in the pull request:
https://github.com/apache/spark/pull/14119#discussion_r70245522
--- Diff: examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala ---
@@ -41,43 +35,47 @@ object HiveFromSpark {
// in the current directory and creates a directory configured by `spark.sql.warehouse.dir`,
// which defaults to the directory `spark-warehouse` in the current directory where the Spark
// application is started.
- val spark = SparkSession.builder
- .appName("HiveFromSpark")
- .enableHiveSupport()
- .getOrCreate()
+
+ // $example on:spark_hive$
+ // warehouseLocation points to the default location for managed databases and tables
+ val warehouseLocation = "file:${system:user.dir}/spark-warehouse"
+
+ val spark = SparkSession
+ .builder()
+ .appName("Spark Hive Example")
+ .config("spark.sql.warehouse.dir", warehouseLocation)
+ .enableHiveSupport()
+ .getOrCreate()
import spark.implicits._
import spark.sql
sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
- sql(s"LOAD DATA LOCAL INPATH '${kv1File.getAbsolutePath}' INTO TABLE
src")
+ sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO
TABLE src")
// Queries are expressed in HiveQL
- println("Result of 'SELECT *': ")
- sql("SELECT * FROM src").collect().foreach(println)
+ sql("SELECT * FROM src").show()
// Aggregation queries are also supported.
- val count = sql("SELECT COUNT(*) FROM src").collect().head.getLong(0)
- println(s"COUNT(*): $count")
+ sql("SELECT COUNT(*) FROM src").show()
- // The results of SQL queries are themselves RDDs and support all normal RDD functions. The
- // items in the RDD are of type Row, which allows you to access each column by ordinal.
- val rddFromSql = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key")
+ // The results of SQL queries are themselves DataFrames and support all normal functions.
+ val sqlDF = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key")
- println("Result of RDD.map:")
- val rddAsStrings = rddFromSql.rdd.map {
+ // The items in DaraFrames are of type Row, which allows you to access
each column by ordinal.
+ val stringsDS = sqlDF.map {
case Row(key: Int, value: String) => s"Key: $key, Value: $value"
}
+ stringsDS.show()
- // You can also use RDDs to create temporary views within a HiveContext.
- val rdd = spark.sparkContext.parallelize((1 to 100).map(i => Record(i, s"val_$i")))
- rdd.toDF().createOrReplaceTempView("records")
+ // You can also use DataFrames to create temporary views within a HiveContext.
+ val recordsDF = spark.createDataFrame((1 to 100).map(i => Record(i, s"val_$i")))
+ recordsDF.createOrReplaceTempView("records")
- // Queries can then join RDD data with data stored in Hive.
- println("Result of SELECT *:")
- sql("SELECT * FROM records r JOIN src s ON r.key = s.key").collect().foreach(println)
+ // Queries can then join DataFrame data with data stored in Hive.
+ sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show()
+ // $example off:spark_hive$
spark.stop()
}
-}
-// scalastyle:on println
+}
--- End diff ---
Nit: Please add a newline at the end of the source file.
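
For anyone who wants to try the new flow outside of this diff, here is a minimal standalone sketch of the same pattern (a Hive-enabled SparkSession with an explicit warehouse location, a DataFrame-backed temp view, and the Row-to-String mapping). This is my own sketch, not code from the PR; it assumes a Spark 2.0 build with Hive support on the classpath, and the object and app names are placeholders:

    import org.apache.spark.sql.{Row, SparkSession}

    object SparkHiveSketch {
      // Mirrors the Record case class used by the example under review
      case class Record(key: Int, value: String)

      def main(args: Array[String]): Unit = {
        // Warehouse location for managed databases and tables, as in the diff
        val warehouseLocation = "file:${system:user.dir}/spark-warehouse"

        val spark = SparkSession
          .builder()
          .appName("SparkHiveSketch") // placeholder app name
          .config("spark.sql.warehouse.dir", warehouseLocation)
          .enableHiveSupport()
          .getOrCreate()
        import spark.implicits._

        // A local DataFrame registered as a temp view; no Hive table needed yet
        val recordsDF = spark.createDataFrame((1 to 100).map(i => Record(i, s"val_$i")))
        recordsDF.createOrReplaceTempView("records")

        // Rows map to Strings via the implicit Encoder from spark.implicits._,
        // the same pattern that replaces rddFromSql.rdd.map in the diff
        val stringsDS = recordsDF.map {
          case Row(key: Int, value: String) => s"Key: $key, Value: $value"
        }
        stringsDS.show()

        spark.stop()
      }
    }

Run it with spark-submit against a build compiled with the Hive profile (-Phive); queries against real Hive tables additionally need a reachable metastore, otherwise Spark falls back to a local Derby metastore_db in the current directory.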