Github user MaxGekk commented on a diff in the pull request:
https://github.com/apache/spark/pull/22374#discussion_r217279960
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
---
@@ -240,23 +240,25 @@ object TextInputCSVDataSource extends CSVDataSource {
sparkSession: SparkSession,
csv: Dataset[String],
maybeFirstLine: Option[String],
- parsedOptions: CSVOptions): StructType = maybeFirstLine match {
- case Some(firstLine) =>
- val firstRow = new
CsvParser(parsedOptions.asParserSettings).parseLine(firstLine)
- val caseSensitive =
sparkSession.sessionState.conf.caseSensitiveAnalysis
- val header = makeSafeHeader(firstRow, caseSensitive, parsedOptions)
- val sampled: Dataset[String] = CSVUtils.sample(csv, parsedOptions)
- val tokenRDD = sampled.rdd.mapPartitions { iter =>
- val filteredLines = CSVUtils.filterCommentAndEmpty(iter,
parsedOptions)
- val linesWithoutHeader =
- CSVUtils.filterHeaderLine(filteredLines, firstLine,
parsedOptions)
- val parser = new CsvParser(parsedOptions.asParserSettings)
- linesWithoutHeader.map(parser.parseLine)
- }
- CSVInferSchema.infer(tokenRDD, header, parsedOptions)
- case None =>
- // If the first line could not be read, just return the empty schema.
- StructType(Nil)
+ parsedOptions: CSVOptions): StructType = {
+ val csvParser = new CsvParser(parsedOptions.asParserSettings)
+ maybeFirstLine.map(csvParser.parseLine(_)) match {
+ case Some(firstRow) if firstRow != null =>
+ val caseSensitive =
sparkSession.sessionState.conf.caseSensitiveAnalysis
+ val header = makeSafeHeader(firstRow, caseSensitive, parsedOptions)
+ val sampled: Dataset[String] = CSVUtils.sample(csv, parsedOptions)
+ val tokenRDD = sampled.rdd.mapPartitions { iter =>
+ val filteredLines = CSVUtils.filterCommentAndEmpty(iter,
parsedOptions)
+ val linesWithoutHeader =
+ CSVUtils.filterHeaderLine(filteredLines, maybeFirstLine.get,
parsedOptions)
+ val parser = new CsvParser(parsedOptions.asParserSettings)
+ linesWithoutHeader.map(parser.parseLine)
+ }
+ CSVInferSchema.infer(tokenRDD, header, parsedOptions)
--- End diff --
@HyukjinKwon I have checked this (with the header option enabled too):
```scala
val input = spark.createDataset(Seq("1", "\u0000\u0000\u0001234"))
val df = spark.read.option("inferSchema", true).csv(input)
df.printSchema()
df.show()
```
```
root
|-- _c0: integer (nullable = true)
+----+
| _c0|
+----+
| 1|
|null|
+----+
```
In the debugger, I did not observe any `null` tokens reaching
https://github.com/apache/spark/blob/5264164a67df498b73facae207eda12ee133be7d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala#L61-L69.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]