[GitHub] spark pull request #17315: [SPARK-19949][SQL] unify bad record handling in C...

HyukjinKwon Mon, 20 Mar 2017 01:53:29 -0700

Github user HyukjinKwon commented on a diff in the pull request:

    https://github.com/apache/spark/pull/17315#discussion_r106852242
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
 ---
    @@ -233,81 +187,39 @@ class UnivocityParser(
        * Parses a single CSV string and turns it into either one resulting row 
or no row (if the
        * record is malformed).
        */
    -  def parse(input: String): Option[InternalRow] = 
convert(tokenizer.parseLine(input))
    -
    -  private def convert(tokens: Array[String]): Option[InternalRow] = {
    -    convertWithParseMode(tokens) { tokens =>
    -      var i: Int = 0
    -      while (i < tokenIndexArr.length) {
    -        // It anyway needs to try to parse since it decides if this row is 
malformed
    -        // or not after trying to cast in `DROPMALFORMED` mode even if the 
casted
    -        // value is not stored in the row.
    -        val from = tokenIndexArr(i)
    -        val to = rowIndexArr(i)
    -        val value = valueConverters(from).apply(tokens(from))
    -        if (i < requiredSchema.length) {
    -          row(to) = value
    -        }
    -        i += 1
    -      }
    -      row
    -    }
    -  }
    -
    -  private def convertWithParseMode(
    -      tokens: Array[String])(convert: Array[String] => InternalRow): 
Option[InternalRow] = {
    -    if (options.dropMalformed && dataSchema.length != tokens.length) {
    -      if (numMalformedRecords < options.maxMalformedLogPerPartition) {
    -        logWarning(s"Dropping malformed line: 
${tokens.mkString(options.delimiter.toString)}")
    -      }
    -      if (numMalformedRecords == options.maxMalformedLogPerPartition - 1) {
    -        logWarning(
    -          s"More than ${options.maxMalformedLogPerPartition} malformed 
records have been " +
    -            "found on this partition. Malformed records from now on will 
not be logged.")
    +  def parse(input: String): InternalRow = 
convert(tokenizer.parseLine(input))
    +
    +  private def convert(tokens: Array[String]): InternalRow = {
    +    if (tokens.length != schema.length) {
    +      // If the number of tokens doesn't match the schema, we should treat 
it as a malformed record.
    +      // However, we still have chance to parse some of the tokens, by 
adding extra null tokens in
    +      // the tail if the number is smaller, or by dropping extra tokens if 
the number is larger.
    +      val checkedTokens = if (schema.length > tokens.length) {
    +        tokens ++ new Array[String](schema.length - tokens.length)
    +      } else {
    +        tokens.take(schema.length)
           }
    -      numMalformedRecords += 1
    -      None
    -    } else if (options.failFast && dataSchema.length != tokens.length) {
    -      throw new RuntimeException(s"Malformed line in FAILFAST mode: " +
    -        s"${tokens.mkString(options.delimiter.toString)}")
    -    } else {
    -      // If a length of parsed tokens is not equal to expected one, it 
makes the length the same
    -      // with the expected. If the length is shorter, it adds extra tokens 
in the tail.
    -      // If longer, it drops extra tokens.
    -      //
    -      // TODO: Revisit this; if a length of tokens does not match an 
expected length in the schema,
    -      // we probably need to treat it as a malformed record.
    -      // See an URL below for related discussions:
    -      // https://github.com/apache/spark/pull/16928#discussion_r102657214
    -      val checkedTokens = if (options.permissive && dataSchema.length != 
tokens.length) {
    -        if (dataSchema.length > tokens.length) {
    -          tokens ++ new Array[String](dataSchema.length - tokens.length)
    -        } else {
    -          tokens.take(dataSchema.length)
    +      def getPartialResult(): Option[InternalRow] = {
    +        try {
    +          Some(convert(checkedTokens))
    +        } catch {
    +          case _: BadRecordException => None
             }
    -      } else {
    -        tokens
           }
    -
    +      throw BadRecordException(
    +        () => getCurrentInput(),
    +        getPartialResult,
    +        new RuntimeException("Malformed CSV record"))
    +    } else {
           try {
    -        Some(convert(checkedTokens))
    +        for (i <- requiredSchema.indices) {
    --- End diff --
    
    (It seems this one was missed. I am fine if it is a new one, but I am worried 
about the change from `while` to `for`. It might not be a really big deal, 
though.)



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #17315: [SPARK-19949][SQL] unify bad record handling in C...

Reply via email to