[GitHub] spark pull request #14102: [SPARK-16434][SQL] Avoid per-record type dispatch...

cloud-fan Thu, 11 Aug 2016 00:54:50 -0700

Github user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/14102#discussion_r74380391
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala
 ---
    @@ -35,184 +34,296 @@ import org.apache.spark.util.Utils
     
     private[json] class SparkSQLJsonProcessingException(msg: String) extends 
RuntimeException(msg)
     
    -object JacksonParser extends Logging {
    +class JacksonParser(
    +    schema: StructType,
    +    columnNameOfCorruptRecord: String,
    +    options: JSONOptions) extends Logging {
     
    -  def parse(
    -      input: RDD[String],
    -      schema: StructType,
    -      columnNameOfCorruptRecords: String,
    -      configOptions: JSONOptions): RDD[InternalRow] = {
    +  import com.fasterxml.jackson.core.JsonToken._
    +
    +  // A `ValueConverter` is responsible for converting a value from 
`JsonParser`
    +  // to a value in a field for `InternalRow`.
    +  private type ValueConverter = (JsonParser) => Any
    +
    +  // `ValueConverter`s for the root schema for all fields in the schema
    +  private val rootConverter: ValueConverter = makeRootConverter(schema)
     
    -    input.mapPartitions { iter =>
    -      parseJson(iter, schema, columnNameOfCorruptRecords, configOptions)
    +  private val factory = new JsonFactory()
    +  options.setJacksonOptions(factory)
    +
    +  /**
    +   * This function handles records that fail to parse. It is called when
    +   * an exception is caught during conversion, and it also deals with the
    +   * `mode` option.
    +   */
    +  private def failedRecord(record: String): Seq[InternalRow] = {
    +    // create a row even if no corrupt record column is present
    +    if (options.failFast) {
    +      throw new RuntimeException(s"Malformed line in FAILFAST mode: 
$record")
    +    }
    +    if (options.dropMalformed) {
    +      logWarning(s"Dropping malformed line: $record")
    +      Nil
    +    } else {
    +      val row = new GenericMutableRow(schema.length)
    +      for (corruptIndex <- 
schema.getFieldIndex(columnNameOfCorruptRecord)) {
    +        require(schema(corruptIndex).dataType == StringType)
    +        row.update(corruptIndex, UTF8String.fromString(record))
    +      }
    +      Seq(row)
         }
       }
     
       /**
    -   * Parse the current token (and related children) according to a desired 
schema
    -   * This is a wrapper for the method `convertField()` to handle a row 
wrapped
    -   * with an array.
    +   * This function will be called afterward, except in the case of
    +   * `StringType`. We throw an exception when the conversion fails, unless
    +   * the value is null.
        */
    -  def convertRootField(
    -      factory: JsonFactory,
    +  private def failedConversion(
           parser: JsonParser,
    -      schema: DataType): Any = {
    -    import com.fasterxml.jackson.core.JsonToken._
    -    (parser.getCurrentToken, schema) match {
    -      case (START_ARRAY, st: StructType) =>
    -        // SPARK-3308: support reading top level JSON arrays and take 
every element
    -        // in such an array as a row
    -        convertArray(factory, parser, st)
    -
    -      case (START_OBJECT, ArrayType(st, _)) =>
    +      dataType: DataType): PartialFunction[JsonToken, Any] = {
    +    case VALUE_STRING if parser.getTextLength < 1 =>
    +      // If the conversion fails, this produces `null` rather than
    +      // throwing an exception. This protects against type mismatches.
    +      null
    +
    +    case token =>
    +      // We cannot parse this token based on the given data type. So, we 
throw a
    +      // SparkSQLJsonProcessingException and this exception will be caught 
by
    +      // `parse` method.
    +      throw new SparkSQLJsonProcessingException(
    +        s"Failed to parse a value for data type $dataType (current token: 
$token).")
    +  }
    +
    +  /**
    +   * Create a converter which converts the JSON documents held by the 
`JsonParser`
    +   * to a value according to a desired schema. This is a wrapper for the 
method
    +   * `makeConverter()` to handle a row wrapped with an array.
    +   */
    +  def makeRootConverter(dataType: DataType): ValueConverter = dataType 
match {
    --- End diff --
    
    The previous `convertRootField` only handles the special cases
    (START_ARRAY, StructType) and (START_OBJECT, ArrayType). Can we follow that
    logic here?



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #14102: [SPARK-16434][SQL] Avoid per-record type dispatch...

Reply via email to