[GitHub] [spark] revans2 commented on a change in pull request #29067: [SPARK-32274][SQL] Make SQL cache serialization pluggable

GitBox Tue, 21 Jul 2020 07:57:08 -0700


revans2 commented on a change in pull request #29067:
URL: https://github.com/apache/spark/pull/29067#discussion_r458163165




##########
File path: 
sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala
##########
@@ -85,77 +232,81 @@ case class CachedRDDBuilder(
   }
 
   private def buildBuffers(): RDD[CachedBatch] = {
-    val output = cachedPlan.output
-    val cached = cachedPlan.execute().mapPartitionsInternal { rowIterator =>
-      new Iterator[CachedBatch] {
-        def next(): CachedBatch = {
-          val columnBuilders = output.map { attribute =>
-            ColumnBuilder(attribute.dataType, batchSize, attribute.name, 
useCompression)
-          }.toArray
-
-          var rowCount = 0
-          var totalSize = 0L
-          while (rowIterator.hasNext && rowCount < batchSize
-            && totalSize < ColumnBuilder.MAX_BATCH_SIZE_IN_BYTE) {
-            val row = rowIterator.next()
-
-            // Added for SPARK-6082. This assertion can be useful for 
scenarios when something
-            // like Hive TRANSFORM is used. The external data generation 
script used in TRANSFORM
-            // may result malformed rows, causing 
ArrayIndexOutOfBoundsException, which is somewhat
-            // hard to decipher.
-            assert(
-              row.numFields == columnBuilders.length,
-              s"Row column number mismatch, expected ${output.size} columns, " 
+
-                s"but got ${row.numFields}." +
-                s"\nRow content: $row")
-
-            var i = 0
-            totalSize = 0
-            while (i < row.numFields) {
-              columnBuilders(i).appendFrom(row, i)
-              totalSize += columnBuilders(i).columnStats.sizeInBytes
-              i += 1
-            }
-            rowCount += 1
-          }
-
-          sizeInBytesStats.add(totalSize)
-          rowCountStats.add(rowCount)
-
-          val stats = InternalRow.fromSeq(
-            columnBuilders.flatMap(_.columnStats.collectedStatistics).toSeq)
-          CachedBatch(rowCount, columnBuilders.map { builder =>
-            JavaUtils.bufferToArray(builder.build())
-          }, stats)
-        }
-
-        def hasNext: Boolean = rowIterator.hasNext
-      }
+    val cb = if (cachedPlan.supportsColumnar) {
+      serializer.convertForCacheColumnar(cachedPlan.executeColumnar(),
+        cachedPlan.output,
+        storageLevel,
+        cachedPlan.conf)
+    } else {
+      serializer.convertForCache(cachedPlan.execute(),
+        cachedPlan.output,
+        storageLevel,
+        cachedPlan.conf)
+    }
+    val cached = cb.map { batch =>
+      sizeInBytesStats.add(batch.sizeInBytes)
+      rowCountStats.add(batch.numRows)
+      batch
     }.persist(storageLevel)
-
     cached.setName(cachedName)
     cached
   }
 }
 
 object InMemoryRelation {
 
+  private[this] var ser: Option[CachedBatchSerializer] = None
+  private[this] def getSerializer(sqlConf: SQLConf): CachedBatchSerializer = 
synchronized {
+    if (ser.isEmpty) {
+      val serName = sqlConf.getConf(StaticSQLConf.SPARK_CACHE_SERIALIZER)
+      val serClass = Utils.classForName(serName)
+      val instance = 
serClass.getConstructor().newInstance().asInstanceOf[CachedBatchSerializer]
+      ser = Some(instance)
+    }
+    ser.get
+  }
+
   def apply(
-      useCompression: Boolean,
-      batchSize: Int,
+      storageLevel: StorageLevel,
+      qe: QueryExecution,
+      tableName: Option[String]): InMemoryRelation = {
+    val optimizedPlan = qe.optimizedPlan
+    val serializer = getSerializer(optimizedPlan.conf)
+    val child = if (serializer.supportsColumnarInput(optimizedPlan.output)) {

Review comment:
       I picked row based input as a minimum requirement because the majority 
of the time the output of the plan will be row based. I also wanted to avoid 
unnecessary transitions from one data format to another. Each time one tries to 
change the format of the data, row to column, column to row, or even column of 
one type to column of another there is a cost involved. As a rule of thumb it 
is ideal to reduce those transformations when ever possible. So in that respect 
columnar input support is a performance optimization in the rare case that 
columnar input is a possibility.
   
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [spark] revans2 commented on a change in pull request #29067: [SPARK-32274][SQL] Make SQL cache serialization pluggable

Reply via email to