Github user tdas commented on a diff in the pull request:
https://github.com/apache/spark/pull/20096#discussion_r160006676
--- Diff:
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala
---
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.kafka010
+
+import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata}
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection}
+import org.apache.spark.sql.kafka010.KafkaSourceProvider.{kafkaParamsForProducer, TOPIC_OPTION_KEY}
+import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter
+import org.apache.spark.sql.sources.v2.writer._
+import org.apache.spark.sql.streaming.OutputMode
+import org.apache.spark.sql.types.{BinaryType, StringType, StructType}
+
+/**
+ * Dummy commit message. The DataSourceV2 framework requires a commit message implementation,
+ * but we don't actually need to send one.
+ */
+case object KafkaWriterCommitMessage extends WriterCommitMessage
+
+/**
+ * A [[ContinuousWriter]] for Kafka writing. Responsible for generating the writer factory.
+ * @param topic The topic this writer is responsible for. If None, topic will be inferred from
+ *              a `topic` field in the incoming data.
+ * @param producerParams Parameters for Kafka producers in each task.
+ * @param schema The schema of the input data.
+ */
+class KafkaContinuousWriter(
+    topic: Option[String], producerParams: Map[String, String], schema: StructType)
+  extends ContinuousWriter with SupportsWriteInternalRow {
+
+  override def createInternalRowWriterFactory(): KafkaContinuousWriterFactory =
+    KafkaContinuousWriterFactory(topic, producerParams, schema)
+
+  override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {}
+  override def abort(messages: Array[WriterCommitMessage]): Unit = {}
+}
+
+/**
+ * A [[DataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to generate
+ * the per-task data writers.
+ * @param topic The topic that should be written to. If None, topic will be inferred from
+ *              a `topic` field in the incoming data.
+ * @param producerParams Parameters for Kafka producers in each task.
+ * @param schema The schema of the input data.
+ */
+case class KafkaContinuousWriterFactory(
+    topic: Option[String], producerParams: Map[String, String], schema: StructType)
+  extends DataWriterFactory[InternalRow] {
+
+  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = {
+    new KafkaContinuousDataWriter(topic, producerParams, schema.toAttributes)
+  }
+}
+
+/**
+ * A [[DataWriter]] for Kafka writing. One data writer will be created in each partition to
+ * process incoming rows.
+ *
+ * @param targetTopic The topic that this data writer is targeting. If None, topic will be inferred
+ *                    from a `topic` field in the incoming data.
+ * @param producerParams Parameters to use for the Kafka producer.
+ * @param inputSchema The attributes in the input data.
+ */
+class KafkaContinuousDataWriter(
+    targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute])
+  extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] {
+  import scala.collection.JavaConverters._
+
+  private lazy val producer = CachedKafkaProducer.getOrCreate(
+    new java.util.HashMap[String, Object](producerParams.asJava))
+
+  def write(row: InternalRow): Unit = {
+    checkForErrors()
+    sendRow(row, producer)
+  }
+
+  def commit(): WriterCommitMessage = {
+    // Send is asynchronous, but we can't commit until all rows are actually in Kafka.
+    // This requires flushing and then checking that no callbacks produced errors.
+    producer.flush()
--- End diff --
Maybe check for errors first, before doing the flush. A flush can take a long time, and it's better to fail earlier if possible.
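
A minimal sketch of that ordering (assuming, as in the diff above, that `checkForErrors()` rethrows the first exception recorded by a producer send callback):

    def commit(): WriterCommitMessage = {
      // Fail fast on any error already reported by an earlier send callback,
      // before paying for a potentially slow flush.
      checkForErrors()
      // Send is asynchronous; flush so all pending rows actually reach Kafka,
      // then check once more for errors surfaced during the flush.
      producer.flush()
      checkForErrors()
      KafkaWriterCommitMessage
    }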
---