[GitHub] spark pull request: [SPARK-6777] [SQL] Implements backwards compat...

rdblue Tue, 14 Jul 2015 11:01:48 -0700

Github user rdblue commented on a diff in the pull request:

    https://github.com/apache/spark/pull/6617#discussion_r34598625
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala
 ---
    @@ -0,0 +1,565 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.parquet
    +
    +import scala.collection.JavaConversions._
    +
    +import org.apache.hadoop.conf.Configuration
    +import org.apache.parquet.schema.OriginalType._
    +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._
    +import org.apache.parquet.schema.Type.Repetition._
    +import org.apache.parquet.schema._
    +
    +import org.apache.spark.sql.types._
    +import org.apache.spark.sql.{AnalysisException, SQLConf}
    +
    +/**
    + * This converter class is used to convert Parquet [[MessageType]] to 
Spark SQL [[StructType]] and
    + * vice versa.
    + *
    + * Parquet format backwards-compatibility rules are respected when 
converting Parquet
    + * [[MessageType]] schemas.
    + *
    + * @see 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
    + *
    + * @constructor
    + * @param assumeBinaryIsString Whether unannotated BINARY fields should be 
assumed to be Spark SQL
    + *        [[StringType]] fields when converting a Parquet [[MessageType]] 
to Spark SQL
    + *        [[StructType]].
    + * @param assumeInt96IsTimestamp Whether unannotated INT96 fields should 
be assumed to be Spark SQL
    + *        [[TimestampType]] fields when converting a Parquet 
[[MessageType]] to Spark SQL
    + *        [[StructType]].  Note that Spark SQL [[TimestampType]] is 
similar to Hive timestamp, which
    + *        has optional nanosecond precision, but different from 
`TIME_MILLIS` and `TIMESTAMP_MILLIS`
    + *        described in Parquet format spec.
    + * @param followParquetFormatSpec Whether to generate standard DECIMAL, 
LIST, and MAP structure when
    + *        converting Spark SQL [[StructType]] to Parquet [[MessageType]].  
For Spark 1.4.x and
    + *        prior versions, Spark SQL only supports decimals with a max 
precision of 18 digits, and
    + *        uses non-standard LIST and MAP structure.  Note that the current 
Parquet format spec is
    + *        backwards-compatible with these settings.  If this argument is 
set to `false`, we fall back
    + *        to old-style non-standard behaviors.
    + */
    +private[parquet] class CatalystSchemaConverter(
    +    private val assumeBinaryIsString: Boolean,
    +    private val assumeInt96IsTimestamp: Boolean,
    +    private val followParquetFormatSpec: Boolean) {
    +
    +  // Only used when constructing converter for converting Spark SQL schema 
to Parquet schema, in
    +  // which case `assumeInt96IsTimestamp` and `assumeBinaryIsString` are 
irrelevant.
    +  def this() = this(
    +    assumeBinaryIsString = 
SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get,
    +    assumeInt96IsTimestamp = 
SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get,
    +    followParquetFormatSpec = 
SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get)
    +
    +  def this(conf: SQLConf) = this(
    +    assumeBinaryIsString = conf.isParquetBinaryAsString,
    +    assumeInt96IsTimestamp = conf.isParquetINT96AsTimestamp,
    +    followParquetFormatSpec = conf.followParquetFormatSpec)
    +
    +  def this(conf: Configuration) = this(
    +    assumeBinaryIsString =
    +      conf.getBoolean(
    +        SQLConf.PARQUET_BINARY_AS_STRING.key,
    +        SQLConf.PARQUET_BINARY_AS_STRING.defaultValue.get),
    +    assumeInt96IsTimestamp =
    +      conf.getBoolean(
    +        SQLConf.PARQUET_INT96_AS_TIMESTAMP.key,
    +        SQLConf.PARQUET_INT96_AS_TIMESTAMP.defaultValue.get),
    +    followParquetFormatSpec =
    +      conf.getBoolean(
    +        SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.key,
    +        SQLConf.PARQUET_FOLLOW_PARQUET_FORMAT_SPEC.defaultValue.get))
    +
    +  /**
    +   * Converts Parquet [[MessageType]] `parquetSchema` to a Spark SQL 
[[StructType]].
    +   */
    +  def convert(parquetSchema: MessageType): StructType = 
convert(parquetSchema.asGroupType())
    +
    +  private def convert(parquetSchema: GroupType): StructType = {
    +    val fields = parquetSchema.getFields.map { field =>
    +      field.getRepetition match {
    +        case OPTIONAL =>
    +          StructField(field.getName, convertField(field), nullable = true)
    +
    +        case REQUIRED =>
    +          StructField(field.getName, convertField(field), nullable = false)
    +
    +        case REPEATED =>
    +          throw new AnalysisException(
    +            s"REPEATED not supported outside LIST or MAP. Type: $field")
    +      }
    +    }
    +
    +    StructType(fields)
    +  }
    +
    +  /**
    +   * Converts a Parquet [[Type]] to a Spark SQL [[DataType]].
    +   */
    +  def convertField(parquetType: Type): DataType = parquetType match {
    +    case t: PrimitiveType => convertPrimitiveField(t)
    +    case t: GroupType => convertGroupField(t.asGroupType())
    +  }
    +
    +  private def convertPrimitiveField(field: PrimitiveType): DataType = {
    +    val typeName = field.getPrimitiveTypeName
    +    val originalType = field.getOriginalType
    +
    +    def typeString =
    +      if (originalType == null) s"$typeName" else s"$typeName 
($originalType)"
    +
    +    def typeNotImplemented() =
    +      throw new AnalysisException(s"Parquet type not yet supported: 
$typeString")
    +
    +    def illegalType() =
    +      throw new AnalysisException(s"Illegal Parquet type: $typeString")
    +
    +    // When maxPrecision = -1, we skip precision range check, and always 
respect the precision
    +    // specified in field.getDecimalMetadata.  This is useful when 
interpreting decimal types stored
    +    // as binaries with variable lengths.
    +    def makeDecimalType(maxPrecision: Int = -1): DecimalType = {
    +      val precision = field.getDecimalMetadata.getPrecision
    +      val scale = field.getDecimalMetadata.getScale
    +
    +      CatalystSchemaConverter.analysisRequire(
    +        maxPrecision == -1 || 1 <= precision && precision <= maxPrecision,
    +        s"Invalid decimal precision: $typeName cannot store $precision 
digits (max $maxPrecision)")
    --- End diff --
    
    It doesn't hurt to double-check this, but this will be verified by the 
builder when the Parquet Type is constructed from file metadata.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-6777] [SQL] Implements backwards compat...

Reply via email to