Github user HyukjinKwon commented on a diff in the pull request:
https://github.com/apache/spark/pull/17315#discussion_r106852242
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
---
@@ -233,81 +187,39 @@ class UnivocityParser(
* Parses a single CSV string and turns it into either one resulting row
or no row (if the
* the record is malformed).
*/
- def parse(input: String): Option[InternalRow] =
convert(tokenizer.parseLine(input))
-
- private def convert(tokens: Array[String]): Option[InternalRow] = {
- convertWithParseMode(tokens) { tokens =>
- var i: Int = 0
- while (i < tokenIndexArr.length) {
- // It anyway needs to try to parse since it decides if this row is
malformed
- // or not after trying to cast in `DROPMALFORMED` mode even if the
casted
- // value is not stored in the row.
- val from = tokenIndexArr(i)
- val to = rowIndexArr(i)
- val value = valueConverters(from).apply(tokens(from))
- if (i < requiredSchema.length) {
- row(to) = value
- }
- i += 1
- }
- row
- }
- }
-
- private def convertWithParseMode(
- tokens: Array[String])(convert: Array[String] => InternalRow):
Option[InternalRow] = {
- if (options.dropMalformed && dataSchema.length != tokens.length) {
- if (numMalformedRecords < options.maxMalformedLogPerPartition) {
- logWarning(s"Dropping malformed line:
${tokens.mkString(options.delimiter.toString)}")
- }
- if (numMalformedRecords == options.maxMalformedLogPerPartition - 1) {
- logWarning(
- s"More than ${options.maxMalformedLogPerPartition} malformed
records have been " +
- "found on this partition. Malformed records from now on will
not be logged.")
+ def parse(input: String): InternalRow =
convert(tokenizer.parseLine(input))
+
+ private def convert(tokens: Array[String]): InternalRow = {
+ if (tokens.length != schema.length) {
+ // If the number of tokens doesn't match the schema, we should treat
it as a malformed record.
+ // However, we still have chance to parse some of the tokens, by
adding extra null tokens in
+ // the tail if the number is smaller, or by dropping extra tokens if
the number is larger.
+ val checkedTokens = if (schema.length > tokens.length) {
+ tokens ++ new Array[String](schema.length - tokens.length)
+ } else {
+ tokens.take(schema.length)
}
- numMalformedRecords += 1
- None
- } else if (options.failFast && dataSchema.length != tokens.length) {
- throw new RuntimeException(s"Malformed line in FAILFAST mode: " +
- s"${tokens.mkString(options.delimiter.toString)}")
- } else {
- // If a length of parsed tokens is not equal to expected one, it
makes the length the same
- // with the expected. If the length is shorter, it adds extra tokens
in the tail.
- // If longer, it drops extra tokens.
- //
- // TODO: Revisit this; if a length of tokens does not match an
expected length in the schema,
- // we probably need to treat it as a malformed record.
- // See an URL below for related discussions:
- // https://github.com/apache/spark/pull/16928#discussion_r102657214
- val checkedTokens = if (options.permissive && dataSchema.length !=
tokens.length) {
- if (dataSchema.length > tokens.length) {
- tokens ++ new Array[String](dataSchema.length - tokens.length)
- } else {
- tokens.take(dataSchema.length)
+ def getPartialResult(): Option[InternalRow] = {
+ try {
+ Some(convert(checkedTokens))
+ } catch {
+ case _: BadRecordException => None
}
- } else {
- tokens
}
-
+ throw BadRecordException(
+ () => getCurrentInput(),
+ getPartialResult,
+ new RuntimeException("Malformed CSV record"))
+ } else {
try {
- Some(convert(checkedTokens))
+ for (i <- requiredSchema.indices) {
--- End diff --
(It seems this one was missed. I am fine if it is a new one, but I am worried
about the change from `while` to `for`. It might not be a really big deal,
though.)
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]