Github user yhuai commented on a diff in the pull request:
https://github.com/apache/spark/pull/14102#discussion_r71096761
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala ---
@@ -35,184 +34,306 @@ import org.apache.spark.util.Utils
private[json] class SparkSQLJsonProcessingException(msg: String) extends RuntimeException(msg)
-object JacksonParser extends Logging {
+private[sql] class JacksonParser(schema: StructType, options: JSONOptions) extends Logging {
+ import com.fasterxml.jackson.core.JsonToken._
- def parse(
- input: RDD[String],
- schema: StructType,
- columnNameOfCorruptRecords: String,
- configOptions: JSONOptions): RDD[InternalRow] = {
+ // A `ValueConverter` is responsible for converting a value from `JsonParser`
+ // to a value in a field for `InternalRow`.
+ private type ValueConverter = (JsonParser) => Any
- input.mapPartitions { iter =>
- parseJson(iter, schema, columnNameOfCorruptRecords, configOptions)
+ // `ValueConverter`s for the root schema for all fields in the schema
+ private val rootConverter: ValueConverter = makeRootConverter(schema)
+
+ private val factory = new JsonFactory()
+ options.setJacksonOptions(factory)
+
+ private def failedConversion(
+ parser: JsonParser,
+ dataType: DataType): Any = parser.getCurrentToken match {
+ case null | VALUE_NULL =>
+ null
+
+ case _ if parser.getTextLength < 1 =>
+ // guard the non string type
+ null
+
+ case token =>
+ // We cannot parse this token based on the given data type. So, we throw a
+ // SparkSQLJsonProcessingException and this exception will be caught by
+ // parseJson method.
+ throw new SparkSQLJsonProcessingException(
+ s"Failed to parse a value for data type $dataType (current token: $token).")
+ }
+
+ private def failedRecord(record: String): Seq[InternalRow] = {
+ // create a row even if no corrupt record column is present
+ if (options.failFast) {
+ throw new RuntimeException(s"Malformed line in FAILFAST mode: $record")
+ }
+ if (options.dropMalformed) {
+ logWarning(s"Dropping malformed line: $record")
+ Nil
+ } else {
+ val row = new GenericMutableRow(schema.length)
+ for (corruptIndex <- schema.getFieldIndex(options.columnNameOfCorruptRecord)) {
+ require(schema(corruptIndex).dataType == StringType)
+ row.update(corruptIndex, UTF8String.fromString(record))
+ }
+ Seq(row)
}
}
/**
- * Parse the current token (and related children) according to a desired schema
- * This is a wrapper for the method `convertField()` to handle a row wrapped
- * with an array.
+ * Create a converter which converts the JSON documents held by the `JsonParser`
+ * to a value according to a desired schema. This is a wrapper for the method
+ * `makeConverter()` to handle a row wrapped with an array.
*/
- def convertRootField(
- factory: JsonFactory,
- parser: JsonParser,
- schema: DataType): Any = {
- import com.fasterxml.jackson.core.JsonToken._
- (parser.getCurrentToken, schema) match {
- case (START_ARRAY, st: StructType) =>
- // SPARK-3308: support reading top level JSON arrays and take every element
- // in such an array as a row
- convertArray(factory, parser, st)
-
- case (START_OBJECT, ArrayType(st, _)) =>
- // the business end of SPARK-3308:
- // when an object is found but an array is requested just wrap it in a list
- convertField(factory, parser, st) :: Nil
+ def makeRootConverter(dataType: DataType): ValueConverter = dataType match {
+ case st: StructType =>
+ // SPARK-3308: support reading top level JSON arrays and take every element
+ // in such an array as a row
+ val elementConverter = makeConverter(st)
+ val fieldConverters = st.map(_.dataType).map(makeConverter)
+ (parser: JsonParser) => parser.getCurrentToken match {
+ case START_OBJECT => convertObject(parser, st, fieldConverters)
+ case START_ARRAY => convertArray(parser, elementConverter)
+ case _ => failedConversion(parser, st)
+ }
- case _ =>
- convertField(factory, parser, schema)
- }
+ case ArrayType(st: StructType, _) =>
+ // the business end of SPARK-3308:
+ // when an object is found but an array is requested just wrap it in a list
+ val elementConverter = makeConverter(st)
+ val fieldConverters = st.map(_.dataType).map(makeConverter)
+ (parser: JsonParser) => parser.getCurrentToken match {
+ case START_OBJECT => convertObject(parser, st, fieldConverters)
+ case START_ARRAY => convertArray(parser, elementConverter)
+ case _ => failedConversion(parser, st)
+ }
+
+ case _ => makeConverter(dataType)
}
- private def convertField(
- factory: JsonFactory,
- parser: JsonParser,
- schema: DataType): Any = {
- import com.fasterxml.jackson.core.JsonToken._
- (parser.getCurrentToken, schema) match {
- case (null | VALUE_NULL, _) =>
- null
-
- case (FIELD_NAME, _) =>
- parser.nextToken()
- convertField(factory, parser, schema)
-
- case (VALUE_STRING, StringType) =>
- UTF8String.fromString(parser.getText)
-
- case (VALUE_STRING, _) if parser.getTextLength < 1 =>
- // guard the non string type
- null
-
- case (VALUE_STRING, BinaryType) =>
- parser.getBinaryValue
-
- case (VALUE_STRING, DateType) =>
- val stringValue = parser.getText
- if (stringValue.contains("-")) {
- // The format of this string will probably be "yyyy-mm-dd".
- DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(parser.getText).getTime)
- } else {
- // In Spark 1.5.0, we store the data as number of days since epoch in string.
- // So, we just convert it to Int.
- stringValue.toInt
+ /**
+ * Create a converter which converts the JSON documents held by the `JsonParser`
+ * to a value according to a desired schema.
+ */
+ private def makeConverter(dataType: DataType): ValueConverter = dataType match {
+ case BooleanType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_TRUE => true
+ case VALUE_FALSE => false
+ case _ => failedConversion(parser, dataType)
}
+ }
- case (VALUE_STRING, TimestampType) =>
- // This one will lose microseconds parts.
- // See https://issues.apache.org/jira/browse/SPARK-10681.
- DateTimeUtils.stringToTime(parser.getText).getTime * 1000L
+ case ByteType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_NUMBER_INT => parser.getByteValue
+ case _ => failedConversion(parser, dataType)
+ }
+ }
- case (VALUE_NUMBER_INT, TimestampType) =>
- parser.getLongValue * 1000000L
+ case ShortType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_NUMBER_INT => parser.getShortValue
+ case _ => failedConversion(parser, dataType)
+ }
+ }
- case (_, StringType) =>
- val writer = new ByteArrayOutputStream()
- Utils.tryWithResource(factory.createGenerator(writer, JsonEncoding.UTF8)) {
- generator => generator.copyCurrentStructure(parser)
+ case IntegerType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_NUMBER_INT => parser.getIntValue
+ case _ => failedConversion(parser, dataType)
}
- UTF8String.fromBytes(writer.toByteArray)
-
- case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, FloatType) =>
- parser.getFloatValue
-
- case (VALUE_STRING, FloatType) =>
- // Special case handling for NaN and Infinity.
- val value = parser.getText
- val lowerCaseValue = value.toLowerCase()
- if (lowerCaseValue.equals("nan") ||
- lowerCaseValue.equals("infinity") ||
- lowerCaseValue.equals("-infinity") ||
- lowerCaseValue.equals("inf") ||
- lowerCaseValue.equals("-inf")) {
- value.toFloat
- } else {
- throw new SparkSQLJsonProcessingException(s"Cannot parse $value
as FloatType.")
+ }
+
+ case LongType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_NUMBER_INT => parser.getLongValue
+ case _ => failedConversion(parser, dataType)
}
+ }
+
+ case FloatType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT =>
+ parser.getFloatValue
+
+ case VALUE_STRING =>
+ // Special case handling for NaN and Infinity.
+ val value = parser.getText
+ val lowerCaseValue = value.toLowerCase
+ if (lowerCaseValue.equals("nan") ||
+ lowerCaseValue.equals("infinity") ||
+ lowerCaseValue.equals("-infinity") ||
+ lowerCaseValue.equals("inf") ||
+ lowerCaseValue.equals("-inf")) {
+ value.toFloat
+ } else {
+ throw new SparkSQLJsonProcessingException(s"Cannot parse
$value as FloatType.")
+ }
- case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, DoubleType) =>
- parser.getDoubleValue
-
- case (VALUE_STRING, DoubleType) =>
- // Special case handling for NaN and Infinity.
- val value = parser.getText
- val lowerCaseValue = value.toLowerCase()
- if (lowerCaseValue.equals("nan") ||
- lowerCaseValue.equals("infinity") ||
- lowerCaseValue.equals("-infinity") ||
- lowerCaseValue.equals("inf") ||
- lowerCaseValue.equals("-inf")) {
- value.toDouble
- } else {
- throw new SparkSQLJsonProcessingException(s"Cannot parse $value
as DoubleType.")
+ case _ => failedConversion(parser, dataType)
}
+ }
- case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, dt: DecimalType) =>
- Decimal(parser.getDecimalValue, dt.precision, dt.scale)
+ case DoubleType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT =>
+ parser.getDoubleValue
+
+ case VALUE_STRING =>
+ // Special case handling for NaN and Infinity.
+ val value = parser.getText
+ val lowerCaseValue = value.toLowerCase
+ if (lowerCaseValue.equals("nan") ||
+ lowerCaseValue.equals("infinity") ||
+ lowerCaseValue.equals("-infinity") ||
+ lowerCaseValue.equals("inf") ||
+ lowerCaseValue.equals("-inf")) {
+ value.toDouble
+ } else {
+ throw new SparkSQLJsonProcessingException(s"Cannot parse
$value as DoubleType.")
+ }
- case (VALUE_NUMBER_INT, ByteType) =>
- parser.getByteValue
+ case _ => failedConversion(parser, dataType)
+ }
+ }
- case (VALUE_NUMBER_INT, ShortType) =>
- parser.getShortValue
+ case StringType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_STRING =>
+ UTF8String.fromString(parser.getText)
- case (VALUE_NUMBER_INT, IntegerType) =>
- parser.getIntValue
+ case token if token != VALUE_NULL =>
+ val writer = new ByteArrayOutputStream()
+ Utils.tryWithResource(factory.createGenerator(writer, JsonEncoding.UTF8)) {
+ generator => generator.copyCurrentStructure(parser)
+ }
+ UTF8String.fromBytes(writer.toByteArray)
- case (VALUE_NUMBER_INT, LongType) =>
- parser.getLongValue
+ case _ => failedConversion(parser, dataType)
+ }
+ }
- case (VALUE_TRUE, BooleanType) =>
- true
+ case TimestampType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_STRING =>
+ // This one will lose microseconds parts.
+ // See https://issues.apache.org/jira/browse/SPARK-10681.
+ DateTimeUtils.stringToTime(parser.getText).getTime * 1000L
- case (VALUE_FALSE, BooleanType) =>
- false
+ case VALUE_NUMBER_INT =>
+ parser.getLongValue * 1000000L
- case (START_OBJECT, st: StructType) =>
- convertObject(factory, parser, st)
+ case _ => failedConversion(parser, dataType)
+ }
+ }
- case (START_ARRAY, ArrayType(st, _)) =>
- convertArray(factory, parser, st)
+ case DateType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_STRING =>
+ val stringValue = parser.getText
+ if (stringValue.contains("-")) {
+ // The format of this string will probably be "yyyy-mm-dd".
+ DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(parser.getText).getTime)
+ } else {
+ // In Spark 1.5.0, we store the data as number of days since epoch in string.
+ // So, we just convert it to Int.
+ stringValue.toInt
+ }
+
+ case _ => failedConversion(parser, dataType)
+ }
+ }
+
+ case BinaryType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case VALUE_STRING => parser.getBinaryValue
+ case _ => failedConversion(parser, dataType)
+ }
+ }
+
+ case dt: DecimalType =>
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT) =>
+ Decimal(parser.getDecimalValue, dt.precision, dt.scale)
+
+ case _ => failedConversion(parser, dt)
+ }
+ }
+
+ case st: StructType =>
+ val fieldConverters = st.map(_.dataType).map(makeConverter)
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case START_OBJECT => convertObject(parser, st, fieldConverters)
+ case _ => failedConversion(parser, st)
+ }
+ }
+
+ case at: ArrayType =>
+ val elementConverter = makeConverter(at.elementType)
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case START_ARRAY => convertArray(parser, elementConverter)
+ case _ => failedConversion(parser, at)
+ }
+ }
- case (START_OBJECT, MapType(StringType, kt, _)) =>
- convertMap(factory, parser, kt)
+ case mt: MapType =>
+ val valueConverter = makeConverter(mt.valueType)
+ (parser: JsonParser) => skipFieldNameTokenIfExists(parser) {
+ parser.getCurrentToken match {
+ case START_OBJECT => convertMap(parser, valueConverter)
+ case _ => failedConversion(parser, mt)
+ }
+ }
- case (_, udt: UserDefinedType[_]) =>
- convertField(factory, parser, udt.sqlType)
+ case udt: UserDefinedType[_] =>
+ makeConverter(udt.sqlType)
- case (token, dataType) =>
- // We cannot parse this token based on the given data type. So, we throw a
- // SparkSQLJsonProcessingException and this exception will be caught by
- // parseJson method.
- throw new SparkSQLJsonProcessingException(
- s"Failed to parse a value for data type $dataType (current token: $token).")
+ case _ =>
+ (parser: JsonParser) =>
+ failedConversion(parser, dataType)
+ }
+
+ private def skipFieldNameTokenIfExists(parser: JsonParser)(f: => Any): Any = {
--- End diff ---
doc: please add a doc comment explaining what this method does.
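
Something along these lines would do, for example. This is just my reading of the intent from the old `case (FIELD_NAME, _)` branch in `convertField`, so adjust the wording as needed:

```scala
/**
 * Applies the conversion `f` to the token the parser is currently positioned
 * on, skipping over a FIELD_NAME token first if there is one. When a field
 * converter runs while iterating the members of a JSON object, the current
 * token is the field name rather than its value, so the parser has to be
 * advanced to the value before `f` can read it.
 */
```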