[GitHub] spark pull request #13727: [SPARK-15982][SPARK-16009][SPARK-16007][SQL] Harm...

gatorsmile Tue, 21 Jun 2016 19:51:12 -0700

Github user gatorsmile commented on a diff in the pull request:

    https://github.com/apache/spark/pull/13727#discussion_r67987384
  
    --- Diff: 
sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala
 ---
    @@ -228,4 +222,152 @@ class DataFrameReaderWriterSuite extends QueryTest 
with SharedSQLContext {
           }
         }
       }
    +
    +  test("load API") {
    +    spark.read.format("org.apache.spark.sql.test").load()
    +    spark.read.format("org.apache.spark.sql.test").load(dir)
    +    spark.read.format("org.apache.spark.sql.test").load(dir, dir, dir)
    +    spark.read.format("org.apache.spark.sql.test").load(Seq(dir, dir): _*)
    +    Option(dir).map(spark.read.format("org.apache.spark.sql.test").load)
    +  }
    +
    +  test("text - API and behavior regarding schema") {
    +    // Writer
    +    spark.createDataset(data).write.mode(SaveMode.Overwrite).text(dir)
    +    testRead(spark.read.text(dir), data, textSchema)
    +
    +    // Reader, without user specified schema
    +    testRead(spark.read.text(), Seq.empty, textSchema)
    +    testRead(spark.read.text(dir, dir, dir), data ++ data ++ data, 
textSchema)
    +    testRead(spark.read.text(Seq(dir, dir): _*), data ++ data, textSchema)
    +    // Test explicit calls to single arg method - SPARK-16009
    +    testRead(Option(dir).map(spark.read.text).get, data, textSchema)
    +
    +    // Reader, with user specified schema, should just apply user schema 
on the file data
    +    testRead(spark.read.schema(userSchema).text(), Seq.empty, userSchema)
    +    testRead(spark.read.schema(userSchema).text(dir), data, userSchema)
    +    testRead(spark.read.schema(userSchema).text(dir, dir), data ++ data, 
userSchema)
    +    testRead(spark.read.schema(userSchema).text(Seq(dir, dir): _*), data 
++ data, userSchema)
    +  }
    +
    +  test("textFile - API and behavior regarding schema") {
    +    spark.createDataset(data).write.mode(SaveMode.Overwrite).text(dir)
    +
    +    // Reader, without user specified schema
    +    testRead(spark.read.textFile().toDF(), Seq.empty, textSchema)
    +    testRead(spark.read.textFile(dir).toDF(), data, textSchema)
    +    testRead(spark.read.textFile(dir, dir).toDF(), data ++ data, 
textSchema)
    +    testRead(spark.read.textFile(Seq(dir, dir): _*).toDF(), data ++ data, 
textSchema)
    +    // Test explicit calls to single arg method - SPARK-16009
    +    testRead(Option(dir).map(spark.read.text).get, data, textSchema)
    +
    +    // Reader, with user specified schema, should just apply user schema 
on the file data
    +    val e = intercept[AnalysisException] { 
spark.read.schema(userSchema).textFile() }
    +    assert(e.getMessage.toLowerCase.contains("user specified schema not 
supported"))
    +    intercept[AnalysisException] { 
spark.read.schema(userSchema).textFile(dir) }
    +    intercept[AnalysisException] { 
spark.read.schema(userSchema).textFile(dir, dir) }
    +    intercept[AnalysisException] { 
spark.read.schema(userSchema).textFile(Seq(dir, dir): _*) }
    +  }
    +
    +  test("csv - API and behavior regarding schema") {
    +    // Writer
    +    
spark.createDataset(data).toDF("str").write.mode(SaveMode.Overwrite).csv(dir)
    +    val df = spark.read.csv(dir)
    +    checkAnswer(df, spark.createDataset(data).toDF())
    +    val schema = df.schema
    +
    +    // Reader, without user specified schema
    +    intercept[IllegalArgumentException] {
    +      testRead(spark.read.csv(), Seq.empty, schema)
    +    }
    +    testRead(spark.read.csv(dir), data, schema)
    +    testRead(spark.read.csv(dir, dir), data ++ data, schema)
    +    testRead(spark.read.csv(Seq(dir, dir): _*), data ++ data, schema)
    +    // Test explicit calls to single arg method - SPARK-16009
    +    testRead(Option(dir).map(spark.read.csv).get, data, schema)
    +
    +    // Reader, with user specified schema, should just apply user schema 
on the file data
    +    testRead(spark.read.schema(userSchema).csv(), Seq.empty, userSchema)
    +    testRead(spark.read.schema(userSchema).csv(dir), data, userSchema)
    +    testRead(spark.read.schema(userSchema).csv(dir, dir), data ++ data, 
userSchema)
    +    testRead(spark.read.schema(userSchema).csv(Seq(dir, dir): _*), data ++ 
data, userSchema)
    +  }
    +
    +  test("json - API and behavior regarding schema") {
    +    // Writer
    +    
spark.createDataset(data).toDF("str").write.mode(SaveMode.Overwrite).json(dir)
    +    val df = spark.read.json(dir)
    +    checkAnswer(df, spark.createDataset(data).toDF())
    +    val schema = df.schema
    +
    +    // Reader, without user specified schema
    +    intercept[AnalysisException] {
    +      testRead(spark.read.json(), Seq.empty, schema)
    +    }
    +    testRead(spark.read.json(dir), data, schema)
    +    testRead(spark.read.json(dir, dir), data ++ data, schema)
    +    testRead(spark.read.json(Seq(dir, dir): _*), data ++ data, schema)
    +    // Test explicit calls to single arg method - SPARK-16009
    +    testRead(Option(dir).map(spark.read.json).get, data, schema)
    +
    +    // Reader, with user specified schema, data should be nulls as schema 
in file different
    +    // from user schema
    +    val expData = Seq[String](null, null, null)
    +    testRead(spark.read.schema(userSchema).json(), Seq.empty, userSchema)
    +    testRead(spark.read.schema(userSchema).json(dir), expData, userSchema)
    +    testRead(spark.read.schema(userSchema).json(dir, dir), expData ++ 
expData, userSchema)
    +    testRead(spark.read.schema(userSchema).json(Seq(dir, dir): _*), 
expData ++ expData, userSchema)
    +  }
    +
    +  test("parquet - API and behavior regarding schema") {
    +    // Writer
    +    
spark.createDataset(data).toDF("str").write.mode(SaveMode.Overwrite).parquet(dir)
    +    val df = spark.read.parquet(dir)
    +    checkAnswer(df, spark.createDataset(data).toDF())
    +    val schema = df.schema
    +
    +    // Reader, without user specified schema
    +    intercept[AnalysisException] {
    +      testRead(spark.read.parquet(), Seq.empty, schema)
    +    }
    +    testRead(spark.read.parquet(dir), data, schema)
    +    testRead(spark.read.parquet(dir, dir), data ++ data, schema)
    +    testRead(spark.read.parquet(Seq(dir, dir): _*), data ++ data, schema)
    +    // Test explicit calls to single arg method - SPARK-16009
    +    testRead(Option(dir).map(spark.read.parquet).get, data, schema)
    +
    +    // Reader, with user specified schema, data should be nulls as schema 
in file different
    +    // from user schema
    +    val expData = Seq[String](null, null, null)
    +    testRead(spark.read.schema(userSchema).parquet(), Seq.empty, 
userSchema)
    +    testRead(spark.read.schema(userSchema).parquet(dir), expData, 
userSchema)
    --- End diff --
    
    @tdas ORC behaves differently: when the user-specified schema does not 
match the physical schema, it fails immediately and reports an exception 
instead of returning `null` values. Do you think that behavior is better than 
returning `null` for all the rows?



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request #13727: [SPARK-15982][SPARK-16009][SPARK-16007][SQL] Harm...

Reply via email to