Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20796#discussion_r175215287
  
    --- Diff: 
sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
 ---
    @@ -1279,4 +1280,57 @@ class CSVSuite extends QueryTest with 
SharedSQLContext with SQLTestUtils {
           Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil
         )
       }
    +
    +  def testHandlingUTF8Char(utf8char: Array[Byte]): Unit = {
    +    val inHex = utf8char.map("%02x".format(_)).mkString("_")
    +    test(s"SPARK-23649: handle the first byte of the char: $inHex") {
    +      withTempPath { path =>
    +        def getBytes(str: String): Array[Byte] = {
    +          str.getBytes(StandardCharsets.UTF_8)
    +        }
    +        val filename = s"${path.getAbsolutePath}.csv"
    +        val header = getBytes("code,channel\n")
    +        val row = getBytes("ABGUN") ++ utf8char ++ getBytes(",United")
    +        val content = header ++ row
    +        Files.write(Paths.get(filename), content)
    --- End diff --
    
    We should specify the encoding here, to avoid the case where different
platforms have different default values.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to