Github user joseph-torres commented on a diff in the pull request:
https://github.com/apache/spark/pull/19925#discussion_r156740666
--- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memoryV2.scala ---
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import javax.annotation.concurrent.GuardedBy
+
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+import scala.util.control.NonFatal
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update}
+import org.apache.spark.sql.sources.v2.{ContinuousWriteSupport, DataSourceV2, DataSourceV2Options, MicroBatchWriteSupport}
+import org.apache.spark.sql.sources.v2.writer._
+import org.apache.spark.sql.streaming.OutputMode
+import org.apache.spark.sql.types.StructType
+
+/**
+ * A sink that stores the results in memory. This [[Sink]] is primarily intended for use in unit
+ * tests and does not provide durability.
+ */
+class MemorySinkV2 extends DataSourceV2
+ with MicroBatchWriteSupport with ContinuousWriteSupport with Logging {
+
+ override def createMicroBatchWriter(
+ queryId: String,
+ batchId: Long,
+ schema: StructType,
+ mode: OutputMode,
+ options: DataSourceV2Options): java.util.Optional[DataSourceV2Writer] = {
+ java.util.Optional.of(new MemoryWriter(this, batchId, mode))
+ }
+
+ override def createContinuousWriter(
+ queryId: String,
+ schema: StructType,
+ mode: OutputMode,
+ options: DataSourceV2Options): java.util.Optional[ContinuousWriter] = {
+ java.util.Optional.of(new ContinuousMemoryWriter(this, mode))
+ }
+
+ private case class AddedData(batchId: Long, data: Array[Row])
+
+ /** An ordered list of batches that have been written to this [[Sink]]. */
+ @GuardedBy("this")
+ private val batches = new ArrayBuffer[AddedData]()
+
+ /** Returns all rows that are stored in this [[Sink]]. */
+ def allData: Seq[Row] = synchronized {
+ batches.flatMap(_.data)
+ }
+
+ def latestBatchId: Option[Long] = synchronized {
+ batches.lastOption.map(_.batchId)
+ }
+
+ def latestBatchData: Seq[Row] = synchronized {
+ batches.lastOption.toSeq.flatten(_.data)
+ }
+
+ def toDebugString: String = synchronized {
+ batches.map { case AddedData(batchId, data) =>
+ val dataStr = try data.mkString(" ") catch {
+ case NonFatal(e) => "[Error converting to string]"
+ }
+ s"$batchId: $dataStr"
+ }.mkString("\n")
+ }
+
+ def write(batchId: Long, outputMode: OutputMode, newRows: Array[Row]): Unit = {
+ val notCommitted = synchronized {
+ latestBatchId.isEmpty || batchId > latestBatchId.get
+ }
+ if (notCommitted) {
+ logDebug(s"Committing batch $batchId to $this")
+ outputMode match {
+ case Append | Update =>
+ val rows = AddedData(batchId, newRows)
+ synchronized { batches += rows }
+
+ case Complete =>
+ val rows = AddedData(batchId, newRows)
+ synchronized {
+ batches.clear()
+ batches += rows
+ }
+
+ case _ =>
+ throw new IllegalArgumentException(
+ s"Output mode $outputMode is not supported by MemorySink")
+ }
+ } else {
+ logDebug(s"Skipping already committed batch: $batchId")
+ }
+ }
+
+ def clear(): Unit = synchronized {
+ batches.clear()
+ }
+
+ override def toString(): String = "MemorySinkV2"
+}
+
+case class MemoryWriterCommitMessage(partition: Int, data: Seq[Row]) extends WriterCommitMessage {}
+
+class MemoryWriter(sink: MemorySinkV2, batchId: Long, outputMode: OutputMode)
+ extends DataSourceV2Writer with Logging {
+
+ override def createWriterFactory: MemoryWriterFactory = MemoryWriterFactory(outputMode)
+
+ def commit(messages: Array[WriterCommitMessage]): Unit = {
+ val newRows = messages.flatMap { message =>
+ // TODO remove
+ if (message != null) {
+ assert(message.isInstanceOf[MemoryWriterCommitMessage])
+ message.asInstanceOf[MemoryWriterCommitMessage].data
+ } else {
+ Seq()
+ }
+ }
+ sink.write(batchId, outputMode, newRows)
+ }
+
+ override def abort(messages: Array[WriterCommitMessage]): Unit = {
+ // Don't accept any of the new input.
+ }
+}
+
+class ContinuousMemoryWriter(val sink: MemorySinkV2, outputMode: OutputMode)
+ extends ContinuousWriter {
+
+ override def createWriterFactory: MemoryWriterFactory = MemoryWriterFactory(outputMode)
+
+ override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {
+ val newRows = messages.flatMap {
+ case message: MemoryWriterCommitMessage => message.data
+ case _ => Seq()
--- End diff ---
I removed the explicit mapping. I don't want to set the expectation that
every data source implementation needs to check and throw exceptions here; it's
Spark's responsibility to pass in the right type of message.
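For reference, since the diff is truncated above, here is how the full method reads with the tolerant match. This is a sketch reconstructed from the visible lines; the closing of the flatMap and the final sink.write call are assumed, by analogy with MemoryWriter.commit earlier in the file:

    override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {
      val newRows = messages.flatMap {
        // Spark is responsible for routing only this sink's own commit
        // messages here, so unexpected types are silently dropped rather
        // than checked and thrown on.
        case message: MemoryWriterCommitMessage => message.data
        case _ => Seq()
      }
      sink.write(epochId, outputMode, newRows)
    }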