Github user MaxGekk commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22374#discussion_r217279960
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
 ---
    @@ -240,23 +240,25 @@ object TextInputCSVDataSource extends CSVDataSource {
           sparkSession: SparkSession,
           csv: Dataset[String],
           maybeFirstLine: Option[String],
    -      parsedOptions: CSVOptions): StructType = maybeFirstLine match {
    -    case Some(firstLine) =>
    -      val firstRow = new 
CsvParser(parsedOptions.asParserSettings).parseLine(firstLine)
    -      val caseSensitive = 
sparkSession.sessionState.conf.caseSensitiveAnalysis
    -      val header = makeSafeHeader(firstRow, caseSensitive, parsedOptions)
    -      val sampled: Dataset[String] = CSVUtils.sample(csv, parsedOptions)
    -      val tokenRDD = sampled.rdd.mapPartitions { iter =>
    -        val filteredLines = CSVUtils.filterCommentAndEmpty(iter, 
parsedOptions)
    -        val linesWithoutHeader =
    -          CSVUtils.filterHeaderLine(filteredLines, firstLine, 
parsedOptions)
    -        val parser = new CsvParser(parsedOptions.asParserSettings)
    -        linesWithoutHeader.map(parser.parseLine)
    -      }
    -      CSVInferSchema.infer(tokenRDD, header, parsedOptions)
    -    case None =>
    -      // If the first line could not be read, just return the empty schema.
    -      StructType(Nil)
    +      parsedOptions: CSVOptions): StructType = {
    +    val csvParser = new CsvParser(parsedOptions.asParserSettings)
    +    maybeFirstLine.map(csvParser.parseLine(_)) match {
    +      case Some(firstRow) if firstRow != null =>
    +        val caseSensitive = 
sparkSession.sessionState.conf.caseSensitiveAnalysis
    +        val header = makeSafeHeader(firstRow, caseSensitive, parsedOptions)
    +        val sampled: Dataset[String] = CSVUtils.sample(csv, parsedOptions)
    +        val tokenRDD = sampled.rdd.mapPartitions { iter =>
    +          val filteredLines = CSVUtils.filterCommentAndEmpty(iter, 
parsedOptions)
    +          val linesWithoutHeader =
    +            CSVUtils.filterHeaderLine(filteredLines, maybeFirstLine.get, 
parsedOptions)
    +          val parser = new CsvParser(parsedOptions.asParserSettings)
    +          linesWithoutHeader.map(parser.parseLine)
    +        }
    +        CSVInferSchema.infer(tokenRDD, header, parsedOptions)
    --- End diff --
    
    @HyukjinKwon I have checked this (with the header option enabled too):
    ```scala
        val input = spark.createDataset(Seq("1", "\u0000\u0000\u0001234"))
    
        val df = spark.read.option("inferSchema", true).csv(input)
        df.printSchema()
        df.show()
    ```
    ```
    root
     |-- _c0: integer (nullable = true)
    
    +----+
    | _c0|
    +----+
    |   1|
    |null|
    +----+
    ```
    In the debugger, I didn't observe `null` in 
https://github.com/apache/spark/blob/5264164a67df498b73facae207eda12ee133be7d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala#L61-L69
 .


---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to