akshatshenoi-db commented on code in PR #56193:
URL: https://github.com/apache/spark/pull/56193#discussion_r3364592096
##########
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala:
##########
@@ -119,24 +124,26 @@ case class CSVFileFormat() extends TextBasedFileFormat
with DataSourceRegister {
dataSchema.filterNot(_.name ==
parsedOptions.columnNameOfCorruptRecord))
val actualRequiredSchema = StructType(
requiredSchema.filterNot(_.name ==
parsedOptions.columnNameOfCorruptRecord))
- val parser = new UnivocityParser(
- actualDataSchema,
- actualRequiredSchema,
- parsedOptions,
- actualFilters)
// Use column pruning when specified by Catalyst, except when one or
more columns have
// existence default value(s), since in that case we instruct the CSV
parser to disable column
// pruning and instead read each entire row in order to correctly assign
the default value(s).
val schema = if (isColumnPruningEnabled) actualRequiredSchema else
actualDataSchema
- val isStartOfFile = file.start == 0
- val headerChecker = new CSVHeaderChecker(
- schema, parsedOptions, source = s"CSV file: ${file.urlEncodedPath}",
isStartOfFile)
- CSVDataSource(parsedOptions).readFile(
- conf,
- file,
- parser,
- headerChecker,
- requiredSchema)
+
+ def newParser(): UnivocityParser =
+ new UnivocityParser(actualDataSchema, actualRequiredSchema,
parsedOptions, actualFilters)
+ def getHeaderChecker(isStartOfFile: Boolean, source: String):
CSVHeaderChecker =
+ new CSVHeaderChecker(schema, parsedOptions, source, isStartOfFile)
+
+ // A tar archive (always a single split, see `isSplitable`) is streamed
entry by entry when
+ // archive reads are enabled; otherwise the file is parsed directly.
+ if (parsedOptions.archiveFormatEnabled &&
ArchiveReader.isArchivePath(file.toPath)) {
+ CSVDataSource(parsedOptions).readArchive(
Review Comment:
Will implement schema inference in a follow-up PR.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]