[GitHub] [spark] hvanhovell commented on a diff in pull request #40256: [SPARK-42653][CONNECT] Artifact transfer from Scala/JVM client to Server

via GitHub Thu, 02 Mar 2023 11:09:25 -0800


hvanhovell commented on code in PR #40256:
URL: https://github.com/apache/spark/pull/40256#discussion_r1123590310



##########
connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala:
##########
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.connect.client
+
+import java.io.{ByteArrayInputStream, InputStream}
+import java.net.URI
+import java.nio.file.{Files, Path, Paths}
+import java.util.concurrent.CopyOnWriteArrayList
+import java.util.zip.{CheckedInputStream, CRC32}
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.concurrent.Promise
+import scala.concurrent.duration.Duration
+import scala.util.control.NonFatal
+
+import Artifact._
+import com.google.protobuf.ByteString
+import io.grpc.ManagedChannel
+import io.grpc.stub.StreamObserver
+
+import org.apache.spark.connect.proto
+import org.apache.spark.connect.proto.AddArtifactsResponse
+import org.apache.spark.connect.proto.AddArtifactsResponse.ArtifactSummary
+import org.apache.spark.util.{ThreadUtils, Utils}
+
+/**
+ * The Artifact Manager is responsible for handling and transferring artifacts 
from the local
+ * client to the server (local/remote).
+ * @param userContext
+ * @param channel
+ */
+class ArtifactManager(userContext: proto.UserContext, channel: ManagedChannel) 
{
+  // Using the midpoint recommendation of 32KiB for chunk size as specified in
+  // https://github.com/grpc/grpc.github.io/issues/371.
+  private val CHUNK_SIZE: Int = 32 * 1024
+
+  private[this] val stub = proto.SparkConnectServiceGrpc.newStub(channel)
+  private[this] val classFinders = new CopyOnWriteArrayList[ClassFinder]
+
+  /**
+   * Register a [[ClassFinder]] for dynamically generated classes.
+   */
+  def register(finder: ClassFinder): Unit = classFinders.add(finder)
+
+  /**
+   * Add a single artifact to the session.
+   *
+   * Currently only local files with extensions .jar and .class are supported.
+   */
+  def addArtifact(path: String): Unit = {
+    addArtifact(Utils.resolveURI(path))
+  }
+
+  private def parseArtifacts(uri: URI): Seq[Artifact] = {
+    // Currently only local files with extensions .jar and .class are 
supported.
+    uri.getScheme match {
+      case "file" =>
+        val path = Paths.get(uri)
+        val artifact = path.getFileName.toString match {
+          case jar if jar.endsWith(".jar") =>
+            newJarArtifact(path.getFileName, new LocalFile(path))
+          case cf if cf.endsWith(".class") =>
+            newClassArtifact(path.getFileName, new LocalFile(path))
+          case other =>
+            throw new UnsupportedOperationException(s"Unsuppoted file format: 
$other")
+        }
+        Seq[Artifact](artifact)
+
+      case other =>
+        throw new UnsupportedOperationException(s"Unsupported scheme: $other")
+    }
+  }
+
+  /**
+   * Add a single artifact to the session.
+   *
+   * Currently only local files with extensions .jar and .class are supported.
+   */
+  def addArtifact(uri: URI): Unit = addArtifacts(parseArtifacts(uri))
+
+  /**
+   * Ensure that all classfile artifacts have been uploaded to the server, and 
are ready for use.
+   */
+  private[client] def ensureAllClassFileArtifactsUploaded(): Unit = {
+    addArtifacts(classFinders.asScala.flatMap(_.findClasses()))
+  }
+
+  /**
+   * Add multiple artifacts to the session.
+   *
+   * Currently only local files with extensions .jar and .class are supported.
+   */
+  def addArtifacts(uris: Seq[URI]): Unit = 
addArtifacts(uris.flatMap(parseArtifacts))
+
+  /**
+   * Add a number of artifacts to the session.
+   */
+  private def addArtifacts(artifacts: Iterable[Artifact]): Unit = {
+    val promise = Promise[Seq[ArtifactSummary]]
+    val responseHandler = new StreamObserver[proto.AddArtifactsResponse] {
+      private val summaries = mutable.Buffer.empty[ArtifactSummary]
+      override def onNext(v: AddArtifactsResponse): Unit = {
+        v.getArtifactsList.forEach { summary =>
+          summaries += summary
+        }
+      }
+      override def onError(throwable: Throwable): Unit = {
+        promise.failure(throwable)
+      }
+      override def onCompleted(): Unit = {
+        promise.success(summaries.toSeq)
+      }
+    }
+    val stream = stub.addArtifacts(responseHandler)
+    val currentBatch = mutable.Buffer.empty[Artifact]
+    var currentBatchSize = 0L
+
+    def addToBatch(dep: Artifact, size: Long): Unit = {
+      currentBatch += dep
+      currentBatchSize += size
+    }
+
+    def writeBatch(): Unit = {
+      addBatchedArtifacts(currentBatch.toSeq, stream)
+      currentBatch.clear()
+      currentBatchSize = 0
+    }
+
+    artifacts.iterator.foreach { artifact =>
+      val data = artifact.storage.asInstanceOf[LocalData]
+      val size = data.size
+      if (size > CHUNK_SIZE) {
+        // Payload can either be a batch OR a single chunked artifact. Write 
batch if non-empty
+        // before chunking current artifact.
+        if (currentBatch.nonEmpty) {
+          writeBatch()
+        }
+        addChunkedArtifact(artifact, stream)
+      } else {
+        if (currentBatchSize + size > CHUNK_SIZE) {
+          writeBatch()
+        }
+        addToBatch(artifact, size)
+      }
+    }
+    if (currentBatch.nonEmpty) {
+      writeBatch()
+    }
+    stream.onCompleted()
+    ThreadUtils.awaitResult(promise.future, Duration.Inf)
+    // TODO: Handle responses containing CRC failures.
+  }
+
+  /**
+   * Add a batch of artifacts to the stream. All the artifacts in this call 
are packaged into a
+   * single [[proto.AddArtifactsRequest]].
+   */
+  private def addBatchedArtifacts(
+      artifacts: Seq[Artifact],
+      stream: StreamObserver[proto.AddArtifactsRequest]): Unit = {
+    val builder = proto.AddArtifactsRequest
+      .newBuilder()
+      .setUserContext(userContext)
+    artifacts.foreach { artifact =>
+      val in = new 
CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32)
+      try {
+        val data = proto.AddArtifactsRequest.ArtifactChunk
+          .newBuilder()
+          .setData(ByteString.readFrom(in))
+          .setCrc(in.getChecksum.getValue)
+
+        builder.getBatchBuilder
+          .addArtifactsBuilder()
+          .setName(artifact.path.toString)
+          .setData(data)
+          .build()
+      } catch {
+        case NonFatal(e) =>
+          stream.onError(e)
+          throw e
+      } finally {
+        in.close()
+      }
+    }
+    stream.onNext(builder.build())
+  }
+
+  /**
+   * Read data from an [[InputStream]] in pieces of `chunkSize` bytes and 
convert to
+   * protobuf-compatible [[ByteString]].
+   * @param in
+   * @return
+   */
+  private def readNextChunk(in: InputStream): ByteString = {
+    val buf = new Array[Byte](CHUNK_SIZE)
+    var bytesRead = 0
+    var count = 0
+    while (count != -1 && bytesRead < CHUNK_SIZE) {
+      count = in.read(buf, bytesRead, CHUNK_SIZE - bytesRead)
+      if (count != -1) {
+        bytesRead += count
+      }
+    }
+    if (bytesRead == 0) ByteString.empty()
+    else ByteString.copyFrom(buf, 0, bytesRead)
+  }
+
+  /**
+   * Add a artifact in chunks to the stream. The artifact's data is spread out 
over multiple
+   * [[proto.AddArtifactsRequest requests]].
+   */
+  private def addChunkedArtifact(
+      artifact: Artifact,
+      stream: StreamObserver[proto.AddArtifactsRequest]): Unit = {
+    val builder = proto.AddArtifactsRequest
+      .newBuilder()
+      .setUserContext(userContext)
+
+    val in = new 
CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32)
+    try {
+      // First RPC contains the `BeginChunkedArtifact` payload (`begin_chunk`).
+      // Subsequent RPCs contains the `ArtifactChunk` payload (`chunk`).
+      val artifactChunkBuilder = 
proto.AddArtifactsRequest.ArtifactChunk.newBuilder()
+      var dataChunk = readNextChunk(in)
+      // Integer division that rounds up to the nearest whole number.
+      def getNumChunks(size: Long): Long = (size + (CHUNK_SIZE - 1)) / 
CHUNK_SIZE
+
+      builder.getBeginChunkBuilder
+        .setName(artifact.path.toString)
+        .setTotalBytes(artifact.size)
+        .setNumChunks(getNumChunks(artifact.size))
+        .setInitialChunk(
+          artifactChunkBuilder
+            .setData(dataChunk)
+            .setCrc(in.getChecksum.getValue))
+      stream.onNext(builder.build())
+      builder.clearBeginChunk()
+
+      dataChunk = readNextChunk(in)
+      // Consume stream in chunks until there is no data left to read.
+      while (!dataChunk.isEmpty) {
+        artifactChunkBuilder.setData(dataChunk).setCrc(in.getChecksum.getValue)
+        builder.setChunk(artifactChunkBuilder.build())
+        stream.onNext(builder.build())
+
+        builder.clearChunk()
+        dataChunk = readNextChunk(in)
+      }
+    } catch {
+      case NonFatal(e) =>
+        stream.onError(e)
+        throw e
+    } finally {
+      in.close()
+    }
+  }
+}
+
+trait ClassFinder {
+  def findClasses(): Iterator[Artifact]

Review Comment:
   We should document this a bit better. For example is this method returning 
all REPL generated classes, or only the new ones?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] hvanhovell commented on a diff in pull request #40256: [SPARK-42653][CONNECT] Artifact transfer from Scala/JVM client to Server

Reply via email to