GitHub user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/21657#discussion_r199033773
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala ---
@@ -131,20 +132,30 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
       )
     }
     val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
+    val columnPruning = sparkSession.sessionState.conf.csvColumnPruning

     (file: PartitionedFile) => {
       val conf = broadcastedHadoopConf.value.value
       val parser = new UnivocityParser(
         StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         parsedOptions)
-      CSVDataSource(parsedOptions).readFile(
+      val inputRows = CSVDataSource(parsedOptions).readFile(
         conf,
         file,
         parser,
         requiredSchema,
         dataSchema,
         caseSensitive)
+
+      if (columnPruning) {
+        inputRows
+      } else {
+        val inputAttrs = dataSchema.toAttributes
--- End diff --
I think we had better just merge this into master, since we have already added
some changes related to the column pruning work. Let me double-check it before
merging it in.
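
For context, here is a minimal sketch of what the non-pruned branch can do once the diff is complete: when pruning is disabled, `readFile` returns rows shaped like the full `dataSchema`, so they still need to be projected down to `requiredSchema`. This is illustrative only, not necessarily the exact code that follows in the diff; `requiredAttrs` and `project` are made-up names:

```scala
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection

if (columnPruning) {
  // Pruning on: the parser already emitted rows shaped like requiredSchema.
  inputRows
} else {
  // Pruning off: rows carry every column of dataSchema, so project them
  // down to just the columns the scan asked for.
  val inputAttrs = dataSchema.toAttributes
  // Resolve each required column against the full-schema attributes by name
  // (case-sensitive here for brevity; real code should honor caseSensitive).
  val requiredAttrs = requiredSchema.map { field =>
    inputAttrs.find(_.name == field.name).getOrElse(
      sys.error(s"Column ${field.name} not found in dataSchema"))
  }
  val project = UnsafeProjection.create(requiredAttrs, inputAttrs)
  inputRows.map(project)
}
```

IIRC the `csvColumnPruning` flag is backed by the SQL config `spark.sql.csv.parser.columnPruning.enabled`, so this slower projection path only runs when that config is turned off.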