Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/20796#discussion_r174355311
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
---
@@ -1279,4 +1279,22 @@ class CSVSuite extends QueryTest with
SharedSQLContext with SQLTestUtils {
Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil
)
}
+
+ test("skip the first byte of a char if it is disallowed in UTF-8") {
+ val df = spark.read
+ .format("csv")
+ .option("header", "true")
+ .load(testFile("test-data/utf8xFF.csv"))
+ val expectedSchema = new StructType()
+ .add("channel", StringType)
+ .add("code", StringType)
+
+ assert(df.schema == expectedSchema)
+
+ val badStr = new String("ABGUN".getBytes :+ 0xff.toByte)
--- End diff --
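Shall we explicitly specify the encoding? Both `getBytes` and `new String(bytes)` use the platform default charset when none is given.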
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org