Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/20796#discussion_r175215287
--- Diff:
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
---
@@ -1279,4 +1280,57 @@ class CSVSuite extends QueryTest with
SharedSQLContext with SQLTestUtils {
Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil
)
}
+
+ def testHandlingUTF8Char(utf8char: Array[Byte]): Unit = {
+ val inHex = utf8char.map("%02x".format(_)).mkString("_")
+ test(s"SPARK-23649: handle the first byte of the char: $inHex") {
+ withTempPath { path =>
+ def getBytes(str: String): Array[Byte] = {
+ str.getBytes(StandardCharsets.UTF_8)
+ }
+ val filename = s"${path.getAbsolutePath}.csv"
+ val header = getBytes("code,channel\n")
+ val row = getBytes("ABGUN") ++ utf8char ++ getBytes(",United")
+ val content = header ++ row
+ Files.write(Paths.get(filename), content)
--- End diff --
We should specify the encoding here, to avoid the case where different
platforms have different default values.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]