Github user maropu commented on a diff in the pull request:
https://github.com/apache/spark/pull/21657#discussion_r202528933
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala ---
@@ -38,24 +38,29 @@ class UnivocityParser(
requiredSchema: StructType,
val options: CSVOptions) extends Logging {
require(requiredSchema.toSet.subsetOf(dataSchema.toSet),
- "requiredSchema should be the subset of schema.")
+ s"requiredSchema (${requiredSchema.catalogString}) should be the subset of " +
+ s"dataSchema (${dataSchema.catalogString}).")
def this(schema: StructType, options: CSVOptions) = this(schema, schema, options)
// A `ValueConverter` is responsible for converting the given value to a desired type.
private type ValueConverter = String => Any
+ // This index is used to reorder parsed tokens
+ private val tokenIndexArr =
+ requiredSchema.map(f => java.lang.Integer.valueOf(dataSchema.indexOf(f))).toArray
+
val tokenizer = {
val parserSetting = options.asParserSettings
if (options.columnPruning && requiredSchema.length < dataSchema.length) {
- val tokenIndexArr = requiredSchema.map(f => java.lang.Integer.valueOf(dataSchema.indexOf(f)))
parserSetting.selectIndexes(tokenIndexArr: _*)
}
new CsvParser(parserSetting)
}
- private val schema = if (options.columnPruning) requiredSchema else dataSchema
- private val row = new GenericInternalRow(schema.length)
+ private val parsedSchema = if (options.columnPruning) requiredSchema else dataSchema
--- End diff --
ok
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]