Github user gengliangwang commented on a diff in the pull request:
https://github.com/apache/spark/pull/20894#discussion_r188841632
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
---
@@ -202,28 +263,33 @@ object TextInputCSVDataSource extends CSVDataSource {
object MultiLineCSVDataSource extends CSVDataSource {
override val isSplitable: Boolean = false
- override def readFile(
- conf: Configuration,
- file: PartitionedFile,
- parser: UnivocityParser,
- schema: StructType): Iterator[InternalRow] = {
+ override def readFile(conf: Configuration, file: PartitionedFile,
parser: UnivocityParser,
+ schema: StructType, dataSchema: StructType,
+ caseSensitive: Boolean): Iterator[InternalRow] = {
+ def checkHeader(header: Array[String]): Unit = {
+ CSVDataSource.checkHeaderColumnNames(dataSchema, header,
file.filePath,
+ checkHeaderFlag = !parser.options.enforceSchema, caseSensitive)
+ }
+
UnivocityParser.parseStream(
CodecStreams.createInputStreamWithCloseResource(conf, new Path(new
URI(file.filePath))),
- parser.options.headerFlag,
- parser,
- schema)
+ parser.options.headerFlag, parser, schema, checkHeader)
}
override def infer(
sparkSession: SparkSession,
inputPaths: Seq[FileStatus],
parsedOptions: CSVOptions): StructType = {
val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
+ // The header is not checked because there is no schema against
which it could be checked
+ def checkHeader(header: Array[String]): Unit = ()
+
csv.flatMap { lines =>
val path = new Path(lines.getPath())
UnivocityParser.tokenizeStream(
CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path),
- shouldDropHeader = false,
+ dropFirstRecord = false,
+ checkHeader,
--- End diff --
No, I mean the one at line 302 is doing nothing.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]