Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/20937#discussion_r178427113
--- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala ---
@@ -2065,29 +2065,238 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
     }
   }
-  def testLineSeparator(lineSep: String): Unit = {
-    test(s"SPARK-21289: Support line separator - lineSep: '$lineSep'") {
-      // Read
-      val data =
-        s"""
-          | {"f":
-          |"a", "f0": 1}$lineSep{"f":
-          |
-          |"c", "f0": 2}$lineSep{"f": "d", "f0": 3}
-        """.stripMargin
-      val dataWithTrailingLineSep = s"$data$lineSep"
-
-      Seq(data, dataWithTrailingLineSep).foreach { lines =>
-        withTempPath { path =>
-          Files.write(path.toPath, lines.getBytes(StandardCharsets.UTF_8))
-          val df = spark.read.option("lineSep", lineSep).json(path.getAbsolutePath)
-          val expectedSchema =
-            StructType(StructField("f", StringType) :: StructField("f0", LongType) :: Nil)
-          checkAnswer(df, Seq(("a", 1), ("c", 2), ("d", 3)).toDF())
-          assert(df.schema === expectedSchema)
+  def testFile(fileName: String): String = {
+    Thread.currentThread().getContextClassLoader.getResource(fileName).toString
+  }
+
+  test("SPARK-23723: json in UTF-16 with BOM") {
+    val fileName = "json-tests/utf16WithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      // This option will be replaced by .option("lineSep", "x00 0a")
+      // as soon as lineSep allows specifying a sequence of bytes in hexadecimal format.
+      .option("mode", "DROPMALFORMED")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(
+      Row("Chris", "Baird"), Row("Doug", "Rood")
+    ))
+  }
+
+  test("SPARK-23723: multi-line json in UTF-32BE with BOM") {
+    val fileName = "json-tests/utf32BEWithBOM.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("encoding" -> "UTF-16LE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: Unsupported charset name") {
+    val invalidCharset = "UTF-128"
+    val exception = intercept[java.io.UnsupportedEncodingException] {
+      spark.read
+        .options(Map("charset" -> invalidCharset, "lineSep" -> "\n"))
+        .json(testFile("json-tests/utf16LE.json"))
+        .count()
+    }
+
+    assert(exception.getMessage.contains(invalidCharset))
+  }
+
+  test("SPARK-23723: checking that the charset option is case agnostic") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val jsonDF = spark.read.schema(schema)
+      .option("multiline", "true")
+      .options(Map("charset" -> "uTf-16lE"))
+      .json(testFile(fileName))
+
+    checkAnswer(jsonDF, Seq(Row("Chris", "Baird")))
+  }
+
+  test("SPARK-23723: specified charset is not matched to actual charset") {
+    val fileName = "json-tests/utf16LE.json"
+    val schema = new StructType().add("firstName", StringType).add("lastName", StringType)
+    val exception = intercept[SparkException] {
+      spark.read.schema(schema)
+        .option("mode", "FAILFAST")
+        .option("multiline", "true")
+        .options(Map("charset" -> "UTF-16BE"))
--- End diff --
You don't have to document a 100% complete list; just a best-effort one. If it's hard to find out every supported encoding, we can fix the list and add more encodings later if anyone finds one that's missing.
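
For reference, the set of supported encodings ultimately depends on the running JVM, which is part of why an exhaustive list is hard to pin down. A minimal sketch (plain `java.nio`, not part of this PR) that dumps whatever the current JVM supports:

```scala
import java.nio.charset.Charset
import scala.collection.JavaConverters._

// Prints the canonical name of every charset the running JVM provides.
// The set varies across JVM vendors and versions, so any documented
// list can only be best-effort.
object ListCharsets {
  def main(args: Array[String]): Unit = {
    Charset.availableCharsets().keySet().asScala.foreach(println)
  }
}
```

Something like this could seed the documented list, with the caveat that it only reflects one particular JVM.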
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org