GitHub user liancheng commented on a diff in the pull request:

    https://github.com/apache/spark/pull/8971#discussion_r41430343
  
    --- Diff: sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala ---
    @@ -69,176 +67,65 @@ class ColumnTypeSuite extends SparkFunSuite with Logging {
         checkActualSize(LONG, Long.MaxValue, 8)
         checkActualSize(FLOAT, Float.MaxValue, 4)
         checkActualSize(DOUBLE, Double.MaxValue, 8)
    -    checkActualSize(STRING, UTF8String.fromString("hello"), 4 + "hello".getBytes("utf-8").length)
    +    checkActualSize(STRING, "hello", 4 + "hello".getBytes("utf-8").length)
         checkActualSize(BINARY, Array.fill[Byte](4)(0.toByte), 4 + 4)
    -    checkActualSize(FIXED_DECIMAL(15, 10), Decimal(0, 15, 10), 8)
    -
    -    val generic = Map(1 -> "a")
    -    checkActualSize(MAP_GENERIC, SparkSqlSerializer.serialize(generic), 4 + 8)
    +    checkActualSize(COMPACT_DECIMAL(15, 10), Decimal(0, 15, 10), 8)
    +    checkActualSize(DECIMAL(20, 10), Decimal(0, 20, 10), 5)
    +    checkActualSize(ARRAY_TYPE, Array[Any](1), 16)
    +    checkActualSize(MAP_TYPE, Map(1 -> "a"), 25)
    +    checkActualSize(STRUCT_TYPE, Row("hello"), 28)
       }
     
    -  testNativeColumnType(BOOLEAN)(
    -    (buffer: ByteBuffer, v: Boolean) => {
    -      buffer.put((if (v) 1 else 0).toByte)
    -    },
    -    (buffer: ByteBuffer) => {
    -      buffer.get() == 1
    -    })
    -
    -  testNativeColumnType(BYTE)(_.put(_), _.get)
    -
    -  testNativeColumnType(SHORT)(_.putShort(_), _.getShort)
    -
    -  testNativeColumnType(INT)(_.putInt(_), _.getInt)
    -
    -  testNativeColumnType(LONG)(_.putLong(_), _.getLong)
    -
    -  testNativeColumnType(FLOAT)(_.putFloat(_), _.getFloat)
    -
    -  testNativeColumnType(DOUBLE)(_.putDouble(_), _.getDouble)
    -
    -  testNativeColumnType(FIXED_DECIMAL(15, 10))(
    -    (buffer: ByteBuffer, decimal: Decimal) => {
    -      buffer.putLong(decimal.toUnscaledLong)
    -    },
    -    (buffer: ByteBuffer) => {
    -      Decimal(buffer.getLong(), 15, 10)
    -    })
    -
    -
    -  testNativeColumnType(STRING)(
    -    (buffer: ByteBuffer, string: UTF8String) => {
    -      val bytes = string.getBytes
    -      buffer.putInt(bytes.length)
    -      buffer.put(bytes)
    -    },
    -    (buffer: ByteBuffer) => {
    -      val length = buffer.getInt()
    -      val bytes = new Array[Byte](length)
    -      buffer.get(bytes)
    -      UTF8String.fromBytes(bytes)
    -    })
    -
    -  testColumnType[Array[Byte]](
    -    BINARY,
    -    (buffer: ByteBuffer, bytes: Array[Byte]) => {
    -      buffer.putInt(bytes.length).put(bytes)
    -    },
    -    (buffer: ByteBuffer) => {
    -      val length = buffer.getInt()
    -      val bytes = new Array[Byte](length)
    -      buffer.get(bytes, 0, length)
    -      bytes
    -    })
    -
    -  test("GENERIC") {
    -    val buffer = ByteBuffer.allocate(512)
    -    val obj = Map(1 -> "spark", 2 -> "sql")
    -    val serializedObj = SparkSqlSerializer.serialize(obj)
    -
    -    MAP_GENERIC.append(SparkSqlSerializer.serialize(obj), buffer)
    -    buffer.rewind()
    -
    -    val length = buffer.getInt()
    -    assert(length === serializedObj.length)
    -
    -    assertResult(obj, "Deserialized object didn't equal to the original object") {
    -      val bytes = new Array[Byte](length)
    -      buffer.get(bytes, 0, length)
    -      SparkSqlSerializer.deserialize(bytes)
    -    }
    -
    -    buffer.rewind()
    -    buffer.putInt(serializedObj.length).put(serializedObj)
    -
    -    assertResult(obj, "Deserialized object didn't equal to the original object") {
    -      buffer.rewind()
    -      SparkSqlSerializer.deserialize(MAP_GENERIC.extract(buffer))
    -    }
    +  testNativeColumnType(BOOLEAN)
    +  testNativeColumnType(BYTE)
    +  testNativeColumnType(SHORT)
    +  testNativeColumnType(INT)
    +  testNativeColumnType(LONG)
    +  testNativeColumnType(FLOAT)
    +  testNativeColumnType(DOUBLE)
    +  testNativeColumnType(COMPACT_DECIMAL(15, 10))
    +  testNativeColumnType(STRING)
    +
    +  testColumnType(NULL)
    +  testColumnType(BINARY)
    +  testColumnType(DECIMAL(20, 10))
    +  testColumnType(STRUCT_TYPE)
    +  testColumnType(ARRAY_TYPE)
    +  testColumnType(MAP_TYPE)
    +
    +  def testNativeColumnType[T <: AtomicType](columnType: NativeColumnType[T]): Unit = {
    +    testColumnType[T#InternalType](columnType)
       }
     
    -  test("CUSTOM") {
    -    val conf = new SparkConf()
    -    conf.set("spark.kryo.registrator", "org.apache.spark.sql.columnar.Registrator")
    -    val serializer = new SparkSqlSerializer(conf).newInstance()
    -
    -    val buffer = ByteBuffer.allocate(512)
    -    val obj = CustomClass(Int.MaxValue, Long.MaxValue)
    -    val serializedObj = serializer.serialize(obj).array()
    -
    -    MAP_GENERIC.append(serializer.serialize(obj).array(), buffer)
    -    buffer.rewind()
    -
    -    val length = buffer.getInt
    -    assert(length === serializedObj.length)
    -    assert(13 == length) // id (1) + int (4) + long (8)
    -
    -    val genericSerializedObj = SparkSqlSerializer.serialize(obj)
    -    assert(length != genericSerializedObj.length)
    -    assert(length < genericSerializedObj.length)
    -
    -    assertResult(obj, "Custom deserialized object didn't equal the original object") {
    -      val bytes = new Array[Byte](length)
    -      buffer.get(bytes, 0, length)
    -      serializer.deserialize(ByteBuffer.wrap(bytes))
    -    }
    -
    -    buffer.rewind()
    -    buffer.putInt(serializedObj.length).put(serializedObj)
    -
    -    assertResult(obj, "Custom deserialized object didn't equal the original object") {
    -      buffer.rewind()
    -      serializer.deserialize(ByteBuffer.wrap(MAP_GENERIC.extract(buffer)))
    -    }
    -  }
    -
    -  def testNativeColumnType[T <: AtomicType](
    -      columnType: NativeColumnType[T])
    -      (putter: (ByteBuffer, T#InternalType) => Unit,
    -      getter: (ByteBuffer) => T#InternalType): Unit = {
    -
    -    testColumnType[T#InternalType](columnType, putter, getter)
    -  }
    -
    -  def testColumnType[JvmType](
    -      columnType: ColumnType[JvmType],
    -      putter: (ByteBuffer, JvmType) => Unit,
    -      getter: (ByteBuffer) => JvmType): Unit = {
    +  def testColumnType[JvmType](columnType: ColumnType[JvmType]): Unit = {
     
         val buffer = ByteBuffer.allocate(DEFAULT_BUFFER_SIZE)
         val seq = (0 until 4).map(_ => makeRandomValue(columnType))
    +    val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType)
     
    -    test(s"$columnType.extract") {
    +    test(s"$columnType append/extract") {
           buffer.rewind()
    -      seq.foreach(putter(buffer, _))
    +      seq.foreach(columnType.append(_, buffer))
     
           buffer.rewind()
           seq.foreach { expected =>
             logInfo("buffer = " + buffer + ", expected = " + expected)
             val extracted = columnType.extract(buffer)
             assert(
    -          expected === extracted,
    +          converter(expected) === converter(extracted),
               "Extracted value didn't equal to the original one. " +
                 hexDump(expected) + " != " + hexDump(extracted) +
                 ", buffer = " + dumpBuffer(buffer.duplicate().rewind().asInstanceOf[ByteBuffer]))
           }
         }
    -
    -    test(s"$columnType.append") {
    -      buffer.rewind()
    -      seq.foreach(columnType.append(_, buffer))
    -
    -      buffer.rewind()
    -      seq.foreach { expected =>
    -        assert(
    -          expected === getter(buffer),
    -          "Extracted value didn't equal to the original one")
    -      }
    -    }
       }
     
       private def hexDump(value: Any): String = {
    -    value.toString.map(ch => Integer.toHexString(ch & 0xffff)).mkString(" ")
    +    if (value == null) {
    +      ""
    +    } else {
    +      value.toString.map(ch => Integer.toHexString(ch & 0xffff)).mkString(" ")
    +    }
    --- End diff ---
    
    Are we dumping all types of data using `hexDump` here? Maybe just for binaries?
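    
    If it is only needed for binary data, one possible shape (just a sketch of the idea, not code from this PR) would be to pattern-match and hex-dump only `Array[Byte]` values:
    
        private def hexDump(value: Any): String = value match {
          case null => ""
          case bytes: Array[Byte] =>
            // Hex-dump raw binary data byte by byte.
            bytes.map(b => Integer.toHexString(b & 0xff)).mkString(" ")
          case other =>
            // Non-binary values read better as plain strings.
            other.toString
        }
    
    That would keep failure messages readable for strings and numbers while still showing raw bytes for BINARY values.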

