msamirkhan commented on a change in pull request #29354:
URL: https://github.com/apache/spark/pull/29354#discussion_r465994403
##########
File path:
external/avro/src/main/scala/org/apache/spark/sql/avro/SparkAvroDatumReader.scala
##########
@@ -452,71 +452,73 @@ class SparkAvroDatumReader[T](
private[this] def getArrayReader(avroType: Schema,
elementType: DataType,
- path: List[String],
- reuseObj: Boolean
+ path: List[String]
): (CatalystDataUpdater, Int, ResolvingDecoder) => Unit = {
val elementReader = newReader(avroType.getElementType, elementType, path,
false)
- val array = new ArrayBuffer[Any]
- val arrayUpdater = new ArrayBufferUpdater(array)
- val toArrayConverter = getToArrayDataConverter(elementType, array)
+ val arrayCreator = getArrayDataCreator(elementType)
+ val arrayExpander = getArrayDataExpander(elementType)
(updater, ordinal, in) => {
var length = in.readArrayStart()
- array.sizeHint(length.toInt)
+ var array = arrayCreator(length) // if (length == 0) 0 else 1)
+ val arrayUpdater = new ArrayDataUpdater(array)
+ var base: Int = 0
while (length > 0) {
var i = 0
while (i < length) {
- elementReader(arrayUpdater, i, in)
+ elementReader(arrayUpdater, base + i, in)
i += 1
+ // array = arrayExpander(arrayUpdater, if (i == length) 0 else 1)
}
+ base += length.toInt
length = in.arrayNext()
- array.sizeHint((array.length + length).toInt)
+ array = arrayExpander(arrayUpdater, length) // if (length == 0) 0 else
1)
}
- updater.set(ordinal, toArrayConverter())
- array.clear()
+ updater.set(ordinal, array)
}
}
Review comment:
In this commit, arrays are read directly into ArrayData. This requires the
ability to "expand" GenericArrayData and UnsafeArrayData. One method was added
to UnsafeArrayData:
https://github.com/apache/spark/pull/29354/commits/2c62ac9ae960582058e96860002f7768eebb95f2#r465994024
Read-time improvements can be found in the benchmarks PDF, on page 2 under
column G.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]