Github user HyukjinKwon commented on a diff in the pull request:

https://github.com/apache/spark/pull/20937#discussion_r184870219

--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2167,4 +2171,241 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     val sampled = spark.read.option("samplingRatio", 1.0).json(ds)
     assert(sampled.count() == ds.count())
   }
+
+  test("SPARK-23723: json in UTF-16 with BOM") {
+    val fileName = "test-data/utf16WithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .option("encoding", "UTF-16")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird"), Row("Doug", "Rood")))
+  }
+
+  test("SPARK-23723: multi-line json in UTF-32BE with BOM") {
+    val fileName = "test-data/utf32BEWithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") {
+    val fileName = "test-data/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "UTF-16LE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Unsupported encoding name") {
+    val invalidCharset = "UTF-128"
+    val exception = intercept[UnsupportedCharsetException] {
+      spark.read
+        .options(Map("encoding" -> invalidCharset, "lineSep" -> "\n"))
+        .json(testFile("test-data/utf16LE.json"))
+        .count()
+    }
+
+    assert(exception.getMessage.contains(invalidCharset))
+  }
+
+  test("SPARK-23723: checking that the encoding option is case agnostic") {
+    val fileName = "test-data/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "uTf-16lE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: specified encoding is not matched to actual encoding") {
+    val fileName = "test-data/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val exception = intercept[SparkException] {
+      spark.read.schema(schema)
+        .option("mode", "FAILFAST")
+        .option("multiline", "true")
+        .options(Map("encoding" -> "UTF-16BE"))
+        .json(testFile(fileName))
+        .count()
+    }
+    val errMsg = exception.getMessage
+
+    assert(errMsg.contains("Malformed records are detected in record parsing"))
+  }
+
+  def checkEncoding(expectedEncoding: String, pathToJsonFiles: String,
+      expectedContent: String): Unit = {
--- End diff --

I think it should be

```
def checkEncoding(
    expectedEncoding: String,
    pathToJsonFiles: String,
    expectedContent: String): Unit = {
```

per https://github.com/databricks/scala-style-guide#spacing-and-indentation, or

```
def checkEncoding(
    expectedEncoding: String, pathToJsonFiles: String, expectedContent: String): Unit = {
```

if it fits, per https://github.com/databricks/scala-style-guide/issues/58#issue-243844040. Not a big deal.
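For readers following along, here is a minimal, self-contained sketch of the indentation rule being suggested; the surrounding object, the method body, and the sample arguments are purely illustrative and not part of the PR. The point is that a wrapped parameter list gets 4-space indentation, while the method body keeps the usual 2 spaces, so the two stay visually distinct:

```scala
object IndentationExample {

  // Parameters that do not fit on one line each go on their own line,
  // indented 4 spaces relative to `def` (per the Databricks style guide),
  // while the body below uses the normal 2-space indentation.
  def checkEncoding(
      expectedEncoding: String,
      pathToJsonFiles: String,
      expectedContent: String): Unit = {
    // Hypothetical body, just so the snippet compiles and runs.
    println(s"expecting $expectedEncoding under $pathToJsonFiles: $expectedContent")
  }

  def main(args: Array[String]): Unit = {
    checkEncoding("UTF-16LE", "/tmp/json-files", """{"firstName": "Chris"}""")
  }
}
```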