[GitHub] spark pull request: [SPARK-6776] [SPARK-8811] [SQL] Refactors Parq...

liancheng Mon, 06 Jul 2015 10:24:06 -0700

Github user liancheng commented on a diff in the pull request:

    https://github.com/apache/spark/pull/7231#discussion_r33958848
  
    --- Diff: 
sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystRowConverter.scala 
---
    @@ -0,0 +1,421 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.sql.parquet
    +
    +import java.nio.ByteOrder
    +
    +import scala.collection.JavaConversions._
    +import scala.collection.mutable
    +import scala.collection.mutable.ArrayBuffer
    +
    +import org.apache.parquet.column.Dictionary
    +import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, 
PrimitiveConverter}
    +import org.apache.parquet.schema.Type.Repetition
    +import org.apache.parquet.schema.{GroupType, PrimitiveType, Type}
    +
    +import org.apache.spark.sql.Row
    +import org.apache.spark.sql.catalyst.expressions._
    +import org.apache.spark.sql.catalyst.util.DateTimeUtils
    +import org.apache.spark.sql.types._
    +import org.apache.spark.unsafe.types.UTF8String
    +
    +/**
    + * A [[ParentContainerUpdater]] is used by a Parquet converter to set 
converted values to some
    + * corresponding parent container. For example, a converter for a 
`StructType` field may set
    + * converted values to a [[MutableRow]]; or a converter for array elements 
may append converted
    + * values to an [[ArrayBuffer]].
    + */
    +private[parquet] trait ParentContainerUpdater {
    +  def set(value: Any): Unit = ()
    +  def setBoolean(value: Boolean): Unit = set(value)
    +  def setByte(value: Byte): Unit = set(value)
    +  def setShort(value: Short): Unit = set(value)
    +  def setInt(value: Int): Unit = set(value)
    +  def setLong(value: Long): Unit = set(value)
    +  def setFloat(value: Float): Unit = set(value)
    +  def setDouble(value: Double): Unit = set(value)
    +}
    +
    +/** A no-op updater used for the root converter (which doesn't have a parent). */
    +private[parquet] object NoopUpdater extends ParentContainerUpdater
    +
    +/**
    + * A [[CatalystRowConverter]] is used to convert Parquet "structs" into 
Spark SQL [[Row]]s.  Since
    + * any Parquet record is also a struct, this converter can also be used as 
root converter.
    + *
    + * When used as a root converter, [[NoopUpdater]] should be used since 
root converters don't have
    + * any "parent" container.
    + *
    + * @param parquetType Parquet schema of Parquet records
    + * @param catalystType Spark SQL schema that corresponds to the Parquet 
record type
    + * @param updater An updater which propagates converted field values to 
the parent container
    + */
    +private[parquet] class CatalystRowConverter(
    +    parquetType: GroupType,
    +    catalystType: StructType,
    +    updater: ParentContainerUpdater)
    +  extends GroupConverter {
    +
    +  /**
    +   * Updater used together with field converters within a 
[[CatalystRowConverter]].  It propagates
    +   * converted field values to the `ordinal`-th cell in `currentRow`.
    +   */
    +  private final class RowUpdater(row: MutableRow, ordinal: Int) extends 
ParentContainerUpdater {
    +    override def set(value: Any): Unit = row(ordinal) = value
    +    override def setBoolean(value: Boolean): Unit = 
row.setBoolean(ordinal, value)
    +    override def setByte(value: Byte): Unit = row.setByte(ordinal, value)
    +    override def setShort(value: Short): Unit = row.setShort(ordinal, 
value)
    +    override def setInt(value: Int): Unit = row.setInt(ordinal, value)
    +    override def setLong(value: Long): Unit = row.setLong(ordinal, value)
    +    override def setDouble(value: Double): Unit = row.setDouble(ordinal, 
value)
    +    override def setFloat(value: Float): Unit = row.setFloat(ordinal, 
value)
    +  }
    +
    +  /**
    +   * Represents the converted row object once an entire Parquet record is 
converted.
    +   *
    +   * @todo Uses [[UnsafeRow]] for better performance.
    +   */
    +  val currentRow = new SpecificMutableRow(catalystType.map(_.dataType))
    +
    +  // Converters for each field.
    +  private val fieldConverters: Array[Converter] = {
    +    parquetType.getFields.zip(catalystType).zipWithIndex.map {
    +      case ((parquetFieldType, catalystField), ordinal) =>
    +        // Converted field value should be set to the `ordinal`-th cell of 
`currentRow`
    +        newConverter(parquetFieldType, catalystField.dataType, new 
RowUpdater(currentRow, ordinal))
    +    }.toArray
    +  }
    +
    +  override def getConverter(fieldIndex: Int): Converter = 
fieldConverters(fieldIndex)
    +
    +  override def end(): Unit = updater.set(currentRow)
    +
    +  override def start(): Unit = {
    +    var i = 0
    +    while (i < currentRow.length) {
    +      currentRow.setNullAt(i)
    +      i += 1
    +    }
    +  }
    +
    +  /**
    +   * Creates a converter for the given Parquet type `parquetType` and 
Spark SQL data type
    +   * `catalystType`. Converted values are handled by `updater`.
    +   */
    +  private def newConverter(
    +      parquetType: Type,
    +      catalystType: DataType,
    +      updater: ParentContainerUpdater): Converter = {
    +
    +    catalystType match {
    +      case BooleanType | IntegerType | LongType | FloatType | DoubleType | 
BinaryType =>
    +        new CatalystPrimitiveConverter(updater)
    +
    +      case ByteType =>
    +        new PrimitiveConverter {
    +          override def addInt(value: Int): Unit =
    +            updater.setByte(value.asInstanceOf[ByteType#InternalType])
    +        }
    +
    +      case ShortType =>
    +        new PrimitiveConverter {
    +          override def addInt(value: Int): Unit =
    +            updater.setShort(value.asInstanceOf[ShortType#InternalType])
    +        }
    +
    +      case t: DecimalType =>
    +        new CatalystDecimalConverter(t, updater)
    +
    +      case StringType =>
    +        new CatalystStringConverter(updater)
    +
    +      case TimestampType =>
    +        new PrimitiveConverter {
    +          override def addBinary(value: Binary): Unit = {
    +            assert(
    +              value.length() == 12,
    +              "Timestamps (with nanoseconds) are expected to be stored in 
12-byte long binaries, " +
    +              s"but got a ${value.length()}-byte binary.")
    +
    +            val buf = value.toByteBuffer.order(ByteOrder.LITTLE_ENDIAN)
    +            val timeOfDayNanos = buf.getLong
    +            val julianDay = buf.getInt
    +            updater.setLong(DateTimeUtils.fromJulianDay(julianDay, 
timeOfDayNanos))
    +          }
    +        }
    --- End diff --
    
    (Should probably add a TODO comment there.)



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request: [SPARK-6776] [SPARK-8811] [SQL] Refactors Parq...

Reply via email to