Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/20937#discussion_r178427113
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     }
   }
-  def testLineSeparator(lineSep: String): Unit = {
-    test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") {
-      // Read
-      val data =
-        s"""
-          | {"f":
-          |"a", "f0": 1}$lineSep{"f":
-          |
-          |"c", "f0": 2}$lineSep{"f": "d", "f0": 3}
-        """.stripMargin
-      val dataWithTrailingLineSep = s"$data$lineSep"
-
-      Seq(data, dataWithTrailingLineSep).foreach { lines =>
-        withTempPath { path =>
-          Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8))
-          val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath)
-          val expectedSchema =
-            StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil)
-          checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF())
-          assert(df.schema === expectedSchema)
+  def testFile(fileName: String): String = {
+    Thread.currentThread().getContextClassLoader.getResource(fileName).toString
+  }
+
+  test("SPARK-23723: json in UTF-16 with BOM") {
+    val fileName = "json-tests/utf16WithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      // This option will be replaced by .option("lineSep", "x00 0a")
+      // as soon as lineSep allows specifying a sequence of bytes in hexadecimal format.
+      .option("mode", "DROPMALFORMED")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(
+      Row("Chris", "Baird"), Row("Doug", "Rood")
+    ))
+  }
+
+  test("SPARK-23723: multi-line json in UTF-32BE with BOM") {
+    val fileName = "json-tests/utf32BEWithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "UTF-16LE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Unsupported charset name") {
+    val invalidCharset = "UTF-128"
+    val exception = intercept[java.io.UnsupportedEncodingException] {
+      spark.read
+        .options(Map("charset" -> invalidCharset, "lineSep" -> "\n"))
+        .json(testFile("json-tests/utf16LE.json"))
+        .count()
+    }
+
+    assert(exception.getMessage.contains(invalidCharset))
+  }
+
+  test("SPARK-23723: checking that the charset option is case agnostic") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("charset" -> "uTf-16lE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: specified charset is not matched to actual charset") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val exception = intercept[SparkException] {
+      spark.read.schema(schema)
+        .option("mode", "FAILFAST")
+        .option("multiline", "true")
+        .options(Map("charset" -> "UTF-16BE"))
--- End diff --
You don't have to document a 100% complete list; just a best-effort one. If it's hard to find out every supported encoding, we can fix the list and add more encodings later if anyone finds one that's missing.
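
For reference, the set of supported encodings ultimately depends on the running JVM, which is part of why an exhaustive list is hard to pin down. A minimal sketch (plain `java.nio`, not part of this PR) that dumps whatever the current JVM supports:

```scala
import java.nio.charset.Charset
import scala.collection.JavaConverters._

// Prints the canonical name of every charset the running JVM provides.
// The set varies across JVM vendors and versions, so any documented
// list can only be best-effort.
object ListCharsets {
  def main(args: Array[String]): Unit = {
    Charset.availableCharsets().keySet().asScala.foreach(println)
  }
}
```

Something like this could seed the documented list, with the caveat that it only reflects one particular JVM.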
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org