HyukjinKwon commented on a change in pull request #29353:
URL: https://github.com/apache/spark/pull/29353#discussion_r465466392
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
##########
@@ -72,137 +74,191 @@ class OrcDeserializer(
/**
* Creates a writer to write ORC values to Catalyst data structure at the
given ordinal.
*/
- private def newWriter(
- dataType: DataType, updater: CatalystDataUpdater): (Int,
WritableComparable[_]) => Unit =
- dataType match {
- case NullType => (ordinal, _) =>
- updater.setNullAt(ordinal)
-
- case BooleanType => (ordinal, value) =>
- updater.setBoolean(ordinal, value.asInstanceOf[BooleanWritable].get)
-
- case ByteType => (ordinal, value) =>
- updater.setByte(ordinal, value.asInstanceOf[ByteWritable].get)
-
- case ShortType => (ordinal, value) =>
- updater.setShort(ordinal, value.asInstanceOf[ShortWritable].get)
-
- case IntegerType => (ordinal, value) =>
- updater.setInt(ordinal, value.asInstanceOf[IntWritable].get)
-
- case LongType => (ordinal, value) =>
- updater.setLong(ordinal, value.asInstanceOf[LongWritable].get)
-
- case FloatType => (ordinal, value) =>
- updater.setFloat(ordinal, value.asInstanceOf[FloatWritable].get)
-
- case DoubleType => (ordinal, value) =>
- updater.setDouble(ordinal, value.asInstanceOf[DoubleWritable].get)
-
- case StringType => (ordinal, value) =>
- updater.set(ordinal,
UTF8String.fromBytes(value.asInstanceOf[Text].copyBytes))
-
- case BinaryType => (ordinal, value) =>
- val binary = value.asInstanceOf[BytesWritable]
- val bytes = new Array[Byte](binary.getLength)
- System.arraycopy(binary.getBytes, 0, bytes, 0, binary.getLength)
- updater.set(ordinal, bytes)
-
- case DateType => (ordinal, value) =>
- updater.setInt(ordinal, OrcShimUtils.getGregorianDays(value))
-
- case TimestampType => (ordinal, value) =>
- updater.setLong(ordinal,
DateTimeUtils.fromJavaTimestamp(value.asInstanceOf[OrcTimestamp]))
-
- case DecimalType.Fixed(precision, scale) => (ordinal, value) =>
- val v = OrcShimUtils.getDecimal(value)
- v.changePrecision(precision, scale)
- updater.set(ordinal, v)
-
- case st: StructType => (ordinal, value) =>
- val result = new SpecificInternalRow(st)
- val fieldUpdater = new RowUpdater(result)
- val fieldConverters = st.map(_.dataType).map { dt =>
- newWriter(dt, fieldUpdater)
- }.toArray
- val orcStruct = value.asInstanceOf[OrcStruct]
-
- var i = 0
- while (i < st.length) {
- val value = orcStruct.getFieldValue(i)
- if (value == null) {
- result.setNullAt(i)
- } else {
- fieldConverters(i)(i, value)
+ private def newWriter(dataType: DataType, reuseObj: Boolean):
+ (CatalystDataUpdater, Int, WritableComparable[_]) => Unit = dataType match {
+ case NullType => (updater, ordinal, _) =>
+ updater.setNullAt(ordinal)
+
+ case BooleanType => (updater, ordinal, value) =>
+ updater.setBoolean(ordinal, value.asInstanceOf[BooleanWritable].get)
+
+ case ByteType => (updater, ordinal, value) =>
+ updater.setByte(ordinal, value.asInstanceOf[ByteWritable].get)
+
+ case ShortType => (updater, ordinal, value) =>
+ updater.setShort(ordinal, value.asInstanceOf[ShortWritable].get)
+
+ case IntegerType => (updater, ordinal, value) =>
+ updater.setInt(ordinal, value.asInstanceOf[IntWritable].get)
+
+ case LongType => (updater, ordinal, value) =>
+ updater.setLong(ordinal, value.asInstanceOf[LongWritable].get)
+
+ case FloatType => (updater, ordinal, value) =>
+ updater.setFloat(ordinal, value.asInstanceOf[FloatWritable].get)
+
+ case DoubleType => (updater, ordinal, value) =>
+ updater.setDouble(ordinal, value.asInstanceOf[DoubleWritable].get)
+
+ case StringType => (updater, ordinal, value) =>
+ updater.set(ordinal,
UTF8String.fromBytes(value.asInstanceOf[Text].copyBytes))
+
+ case BinaryType => (updater, ordinal, value) =>
+ val binary = value.asInstanceOf[BytesWritable]
+ val bytes = new Array[Byte](binary.getLength)
+ System.arraycopy(binary.getBytes, 0, bytes, 0, binary.getLength)
+ updater.set(ordinal, bytes)
+
+ case DateType => (updater, ordinal, value) =>
+ updater.setInt(ordinal, OrcShimUtils.getGregorianDays(value))
+
+ case TimestampType => (updater, ordinal, value) =>
+ updater.setLong(ordinal,
DateTimeUtils.fromJavaTimestamp(value.asInstanceOf[OrcTimestamp]))
+
+ case DecimalType.Fixed(precision, scale) => (updater, ordinal, value) =>
+ val v = OrcShimUtils.getDecimal(value)
+ v.changePrecision(precision, scale)
+ updater.set(ordinal, v)
+
+ case st: StructType =>
+ val createRow: () => InternalRow = getRowCreator(st)
+ val fieldConverters = st.map(_.dataType).map { dt => newWriter(dt,
reuseObj)}.toArray
+ val baseConverter:
+ (InternalRow, RowUpdater, CatalystDataUpdater, Int,
WritableComparable[_]) => Unit = {
+ (row, rowUpdater, updater, ordinal, value) => {
+ val orcStruct = value.asInstanceOf[OrcStruct]
+
+ var i = 0
+ while (i < st.length) {
+ val value = orcStruct.getFieldValue(i)
+ if (value == null) {
+ row.setNullAt(i)
+ } else {
+ fieldConverters(i)(rowUpdater, i, value)
+ }
+ i += 1
}
- i += 1
+
+ updater.set(ordinal, row)
}
+ }
- updater.set(ordinal, result)
+ if (reuseObj) {
+ val row = createRow.apply()
+ val rowUpdater = new RowUpdater(row)
+ (updater, ordinal, value) => baseConverter.apply(row, rowUpdater,
updater, ordinal, value)
+ } else {
+ (updater, ordinal, value) => {
+ val row = createRow.apply()
+ val rowUpdater = new RowUpdater(row)
+ baseConverter.apply(row, rowUpdater, updater, ordinal, value)
+ }
+ }
- case ArrayType(elementType, _) => (ordinal, value) =>
+ case ArrayType(elementType, _) =>
+ val elementConverter = newWriter(elementType, false)
+ val createArray = getArrayDataCreator(elementType)
+ (updater, ordinal, value) => {
val orcArray = value.asInstanceOf[OrcList[WritableComparable[_]]]
val length = orcArray.size()
- val result = createArrayData(elementType, length)
+ val result = createArray(length)
val elementUpdater = new ArrayDataUpdater(result)
- val elementConverter = newWriter(elementType, elementUpdater)
var i = 0
while (i < length) {
val value = orcArray.get(i)
if (value == null) {
result.setNullAt(i)
} else {
- elementConverter(i, value)
+ elementConverter(elementUpdater, i, value)
}
i += 1
}
updater.set(ordinal, result)
+ }
- case MapType(keyType, valueType, _) => (ordinal, value) =>
+ case MapType(keyType, valueType, _) =>
+ val keyConverter = newWriter(keyType, false)
+ val createKeyArray = getArrayDataCreator(keyType)
+ val valueConverter = newWriter(valueType, false)
+ val createValueArray = getArrayDataCreator(valueType)
+ (updater, ordinal, value) => {
val orcMap = value.asInstanceOf[OrcMap[WritableComparable[_],
WritableComparable[_]]]
val length = orcMap.size()
- val keyArray = createArrayData(keyType, length)
+ val keyArray = createKeyArray(length)
val keyUpdater = new ArrayDataUpdater(keyArray)
- val keyConverter = newWriter(keyType, keyUpdater)
- val valueArray = createArrayData(valueType, length)
+ val valueArray = createValueArray(length)
val valueUpdater = new ArrayDataUpdater(valueArray)
- val valueConverter = newWriter(valueType, valueUpdater)
var i = 0
val it = orcMap.entrySet().iterator()
while (it.hasNext) {
val entry = it.next()
- keyConverter(i, entry.getKey)
+ keyConverter(keyUpdater, i, entry.getKey)
val value = entry.getValue
if (value == null) {
valueArray.setNullAt(i)
} else {
- valueConverter(i, value)
+ valueConverter(valueUpdater, i, value)
}
i += 1
}
// The ORC map will never have null or duplicated map keys, it's safe
to create a
// ArrayBasedMapData directly here.
updater.set(ordinal, new ArrayBasedMapData(keyArray, valueArray))
+ }
- case udt: UserDefinedType[_] => newWriter(udt.sqlType, updater)
+ case udt: UserDefinedType[_] =>
+ val converter = newWriter(udt.sqlType, reuseObj)
+ (updater, ordinal, value) => {
+ converter(updater, ordinal, value)
+ }
+
+ case _ =>
+ throw new UnsupportedOperationException(s"$dataType is not supported
yet.")
+ }
+
+ private def getArrayDataCreator(elementType: DataType): Int => ArrayData =
elementType match {
+ case BooleanType => length => UnsafeArrayData.createFreshArray(length, 1)
+ case ByteType => length => UnsafeArrayData.createFreshArray(length, 1)
+ case ShortType => length => UnsafeArrayData.createFreshArray(length, 2)
+ case IntegerType => length => UnsafeArrayData.createFreshArray(length, 4)
+ case LongType => length => UnsafeArrayData.createFreshArray(length, 8)
+ case FloatType => length => UnsafeArrayData.createFreshArray(length, 4)
+ case DoubleType => length => UnsafeArrayData.createFreshArray(length, 8)
+ case _ => length => new GenericArrayData(new Array[Any](length))
+ }
- case _ =>
- throw new UnsupportedOperationException(s"$dataType is not supported
yet.")
+ private def getRowCreator(st: StructType): () => InternalRow = {
+ val constructorsArray = new Array[() => MutableValue](st.fields.length)
+ var i = 0
+ while (i < st.fields.length) {
+ st.fields(i).dataType match {
+ case BooleanType => constructorsArray(i) = () => new MutableBoolean
+ case ByteType => constructorsArray(i) = () => new MutableByte
+ case ShortType => constructorsArray(i) = () => new MutableShort
+ // We use INT for DATE internally
+ case IntegerType | DateType => constructorsArray(i) = () => new
MutableInt
+ // We use Long for Timestamp internally
+ case LongType | TimestampType => constructorsArray(i) = () => new
MutableLong
+ case FloatType => constructorsArray(i) = () => new MutableFloat
+ case DoubleType => constructorsArray(i) = () => new MutableDouble
+ case _ => constructorsArray(i) = () => new MutableAny
+ }
+ i += 1
}
- private def createArrayData(elementType: DataType, length: Int): ArrayData =
elementType match {
Review comment:
Here as well. If we can get the length from the type, as you did in
`getArrayDataCreator`, it looks like we can just set it here and pull the
`createArrayData` logic out of the function so that it is created only once.
Presumably the same applies to `MapType`.
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala
##########
@@ -72,137 +74,191 @@ class OrcDeserializer(
/**
* Creates a writer to write ORC values to Catalyst data structure at the
given ordinal.
*/
- private def newWriter(
- dataType: DataType, updater: CatalystDataUpdater): (Int,
WritableComparable[_]) => Unit =
- dataType match {
- case NullType => (ordinal, _) =>
- updater.setNullAt(ordinal)
-
- case BooleanType => (ordinal, value) =>
- updater.setBoolean(ordinal, value.asInstanceOf[BooleanWritable].get)
-
- case ByteType => (ordinal, value) =>
- updater.setByte(ordinal, value.asInstanceOf[ByteWritable].get)
-
- case ShortType => (ordinal, value) =>
- updater.setShort(ordinal, value.asInstanceOf[ShortWritable].get)
-
- case IntegerType => (ordinal, value) =>
- updater.setInt(ordinal, value.asInstanceOf[IntWritable].get)
-
- case LongType => (ordinal, value) =>
- updater.setLong(ordinal, value.asInstanceOf[LongWritable].get)
-
- case FloatType => (ordinal, value) =>
- updater.setFloat(ordinal, value.asInstanceOf[FloatWritable].get)
-
- case DoubleType => (ordinal, value) =>
- updater.setDouble(ordinal, value.asInstanceOf[DoubleWritable].get)
-
- case StringType => (ordinal, value) =>
- updater.set(ordinal,
UTF8String.fromBytes(value.asInstanceOf[Text].copyBytes))
-
- case BinaryType => (ordinal, value) =>
- val binary = value.asInstanceOf[BytesWritable]
- val bytes = new Array[Byte](binary.getLength)
- System.arraycopy(binary.getBytes, 0, bytes, 0, binary.getLength)
- updater.set(ordinal, bytes)
-
- case DateType => (ordinal, value) =>
- updater.setInt(ordinal, OrcShimUtils.getGregorianDays(value))
-
- case TimestampType => (ordinal, value) =>
- updater.setLong(ordinal,
DateTimeUtils.fromJavaTimestamp(value.asInstanceOf[OrcTimestamp]))
-
- case DecimalType.Fixed(precision, scale) => (ordinal, value) =>
- val v = OrcShimUtils.getDecimal(value)
- v.changePrecision(precision, scale)
- updater.set(ordinal, v)
-
- case st: StructType => (ordinal, value) =>
- val result = new SpecificInternalRow(st)
- val fieldUpdater = new RowUpdater(result)
- val fieldConverters = st.map(_.dataType).map { dt =>
- newWriter(dt, fieldUpdater)
Review comment:
So the problem was that this creates the field writers for each record.
Why don't we simply pull this logic out of the function so that they are
created only once?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]