Github user jackylk commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/1470#discussion_r149078206
--- Diff: streaming/src/main/scala/org/apache/spark/sql/execution/streaming/CarbonAppendableStreamSink.scala ---
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.streaming
+
+import org.apache.spark.internal.io.FileCommitProtocol
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+import org.apache.carbondata.common.logging.LogServiceFactory
+import org.apache.carbondata.core.datastore.impl.FileFactory
+import org.apache.carbondata.core.dictionary.server.DictionaryServer
+import org.apache.carbondata.core.metadata.schema.table.CarbonTable
+import org.apache.carbondata.core.util.path.CarbonStorePath
+import org.apache.carbondata.hadoop.streaming.CarbonStreamOutputFormat
+import org.apache.carbondata.processing.loading.model.CarbonLoadModel
+import org.apache.carbondata.streaming.segment.StreamSegmentManager
+
+class CarbonAppendableStreamSink(
+    sparkSession: SparkSession,
+    val carbonTable: CarbonTable,
+    var currentSegmentId: String,
+    parameters: Map[String, String],
+    carbonLoadModel: CarbonLoadModel,
+    server: Option[DictionaryServer]) extends Sink {
+
+  private val LOGGER =
+    LogServiceFactory.getLogService(this.getClass.getCanonicalName)
+  private val carbonTablePath = CarbonStorePath
+    .getCarbonTablePath(carbonTable.getAbsoluteTableIdentifier)
+  private val fileLogPath = carbonTablePath.getStreamingLogDir
+  private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION,
+    sparkSession, fileLogPath)
+  // prepare configuration
+  private val hadoopConf = {
+    val conf = sparkSession.sessionState.newHadoopConf()
+    CarbonStreamOutputFormat.setCarbonLoadModel(conf, carbonLoadModel)
+    // put all parameters into hadoopConf
+    parameters.foreach { entry =>
+      conf.set(entry._1, entry._2)
+    }
+    conf
+  }
+
+  override def addBatch(batchId: Long, data: DataFrame): Unit = {
+    if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) {
+      LOGGER.info(s"Skipping already committed batch $batchId")
+    } else {
+      checkOrHandOffSegment()
+
+      val committer = FileCommitProtocol.instantiate(
+        className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
+        jobId = batchId.toString,
+        outputPath = fileLogPath,
+        isAppend = false)
+
+      committer match {
+        case manifestCommitter: ManifestFileCommitProtocol =>
+          manifestCommitter.setupManifestOptions(fileLog, batchId)
+        case _ => // Do nothing
+      }
+
+      CarbonStreamProcessor.writeDataFileJob(
+        sparkSession,
+        carbonTable,
+        parameters,
+        batchId,
+        currentSegmentId,
+        data.queryExecution,
+        committer,
+        hadoopConf,
+        server)
+    }
+  }
+
+  // if the directory size of the current segment is beyond the threshold, hand off to a new segment
+  private def checkOrHandOffSegment(): Unit = {
+    val segmentDir = carbonTablePath.getSegmentDir("0", currentSegmentId)
+    val fileType = FileFactory.getFileType(segmentDir)
+    if (StreamSegmentManager.STREAM_SEGMENT_MAX_SIZE <=
+        FileFactory.getDirectorySize(segmentDir)) {
--- End diff ---
Can we make use of metadata instead of checking the file system on every batch?
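
For illustration, here is a minimal sketch of the metadata-based approach the question suggests: keep a running byte count for the current segment and consult it in `checkOrHandOffSegment()`, instead of calling `FileFactory.getDirectorySize` on every batch. The `SegmentSizeTracker` class and its members are hypothetical names for this sketch, not part of the CarbonData API, and it assumes the sink can learn how many bytes each batch wrote (e.g. from the write tasks' results).

```scala
// A hypothetical sketch, not CarbonData API: track the current segment's
// size in memory (or persist it in segment metadata) instead of scanning
// the file system on every batch.
class SegmentSizeTracker(maxSegmentSize: Long) {
  private var bytesWritten: Long = 0L

  // Record the number of bytes appended by a successfully committed batch.
  def recordBatch(batchBytes: Long): Unit = {
    bytesWritten += batchBytes
  }

  // Returns true when the accumulated size reaches the threshold; the
  // caller hands off to a new segment and the counter restarts for it.
  def shouldHandOff(): Boolean = {
    if (bytesWritten >= maxSegmentSize) {
      bytesWritten = 0L
      true
    } else {
      false
    }
  }
}
```

With such a tracker, the check becomes `if (tracker.shouldHandOff()) ...`, turning a per-batch directory scan into a constant-time comparison.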