Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/16854#discussion_r104651799
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
---
@@ -134,23 +133,33 @@ object TextInputCSVDataSource extends CSVDataSource {
inputPaths: Seq[FileStatus],
parsedOptions: CSVOptions): Option[StructType] = {
val csv = createBaseDataset(sparkSession, inputPaths, parsedOptions)
- CSVUtils.filterCommentAndEmpty(csv, parsedOptions).take(1).headOption
match {
- case Some(firstLine) =>
- val firstRow = new
CsvParser(parsedOptions.asParserSettings).parseLine(firstLine)
- val caseSensitive =
sparkSession.sessionState.conf.caseSensitiveAnalysis
- val header = makeSafeHeader(firstRow, caseSensitive, parsedOptions)
- val tokenRDD = csv.rdd.mapPartitions { iter =>
- val filteredLines = CSVUtils.filterCommentAndEmpty(iter,
parsedOptions)
- val linesWithoutHeader =
- CSVUtils.filterHeaderLine(filteredLines, firstLine,
parsedOptions)
- val parser = new CsvParser(parsedOptions.asParserSettings)
- linesWithoutHeader.map(parser.parseLine)
- }
- Some(CSVInferSchema.infer(tokenRDD, header, parsedOptions))
- case None =>
- // If the first line could not be read, just return the empty
schema.
- Some(StructType(Nil))
- }
+ val maybeFirstLine = CSVUtils.filterCommentAndEmpty(csv,
parsedOptions).take(1).headOption
+ Some(inferFromDataset(sparkSession, csv, maybeFirstLine,
parsedOptions))
+ }
+
+ /**
+ * Infers the schema from `Dataset` that stores CSV string records.
+ */
+ def inferFromDataset(
--- End diff --
There is almost no code modification here; the logic was just moved from above.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]