[GitHub] [carbondata] marchpure commented on a change in pull request #3798: [CARBONDATA-3875] Support show segments with stage

2020-06-28 Thread GitBox


marchpure commented on a change in pull request #3798:
URL: https://github.com/apache/carbondata/pull/3798#discussion_r446657621



##
File path: 
integration/spark/src/main/scala/org/apache/carbondata/api/CarbonStore.scala
##
@@ -63,13 +68,101 @@ object CarbonStore {
 }
 
 if (limit.isDefined) {
-  val lim = Integer.parseInt(limit.get)
-  segmentsMetadataDetails.slice(0, lim)
+  segmentsMetadataDetails.slice(0, limit.get)
 } else {
   segmentsMetadataDetails
 }
   }
 
+  /**
+   * Read stage files and return input files
+   */
+  def readStages(
+  tablePath: String,
+  configuration: Configuration): Seq[StageInput] = {
+val stageFiles = listStageFiles(
+  CarbonTablePath.getStageDir(tablePath), configuration)
+var output = Collections.synchronizedList(new util.ArrayList[StageInput]())
+output.addAll(readStageInput(stageFiles._1,
+  StageInput.StageStatus.Unload).asJavaCollection)
+output.addAll(readStageInput(stageFiles._2,
+  StageInput.StageStatus.Loading).asJavaCollection)
+Collections.sort(output, new Comparator[StageInput]() {
+  def compare(stageInput1: StageInput, stageInput2: StageInput): Int = {
+(stageInput2.getCreateTime - stageInput1.getCreateTime).intValue()
+  }
+})
+output.asScala
+  }
+
+  /**
+   * Read stage files and return input files
+   */
+  def readStageInput(
+  stageFiles: Seq[CarbonFile],
+  status: StageInput.StageStatus): Seq[StageInput] = {
+val gson = new Gson()
+val output = Collections.synchronizedList(new util.ArrayList[StageInput]())
+stageFiles.map { stage =>
+  val filePath = stage.getAbsolutePath
+  val stream = FileFactory.getDataInputStream(filePath)
+  try {
+val stageInput = gson.fromJson(new InputStreamReader(stream), 
classOf[StageInput])
+stageInput.setCreateTime(stage.getLastModifiedTime)
+stageInput.setStatus(status)
+output.add(stageInput)
+  } finally {
+stream.close()
+  }
+}
+output.asScala
+  }
+
+  /*
+   * Collect all stage files and matched success files.
+   * A stage file without success file will not be collected
+   */
+  def listStageFiles(

Review comment:
   This is not only for tests; it will also be used in CarbonStore.scala.

##
File path: docs/segment-management-on-carbondata.md
##
@@ -32,7 +32,7 @@ concept which helps to maintain consistency of data and easy transaction managem
 
   ```
   SHOW [HISTORY] SEGMENTS
-  [FOR TABLE | ON] [db_name.]table_name [LIMIT number_of_segments]
+  [FOR TABLE | ON] [db_name.]table_name [INCLUDE STAGE] [LIMIT number_of_segments]

Review comment:
   modified

##
File path: 
integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonShowSegmentsAsSelectCommand.scala
##
@@ -35,7 +36,8 @@ case class CarbonShowSegmentsAsSelectCommand(
 databaseNameOp: Option[String],
 tableName: String,
 query: String,
-limit: Option[String],
+limit: Option[Int],
+includeStage: Boolean = false,

Review comment:
   modified





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [carbondata] marchpure commented on a change in pull request #3798: [CARBONDATA-3875] Support show segments with stage

2020-06-28 Thread GitBox


marchpure commented on a change in pull request #3798:
URL: https://github.com/apache/carbondata/pull/3798#discussion_r446657569



##
File path: 
integration/spark/src/main/scala/org/apache/spark/sql/execution/command/management/CarbonShowSegmentsCommand.scala
##
@@ -17,34 +17,40 @@
 
 package org.apache.spark.sql.execution.command.management
 
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.conf.Configuration
 import org.apache.spark.sql.{CarbonEnv, Row, SparkSession}
 import org.apache.spark.sql.catalyst.expressions.{Attribute, 
AttributeReference}
 import org.apache.spark.sql.execution.command.{Checker, DataCommand}
 import org.apache.spark.sql.types.StringType
 
-import org.apache.carbondata.api.CarbonStore.{getDataAndIndexSize, 
getLoadStartTime, getLoadTimeTaken, getPartitions, readSegments}
+import org.apache.carbondata.api.CarbonStore.{getDataAndIndexSize, 
getLoadStartTime, getLoadTimeTaken, getPartitions, readSegments, readStages}
 import org.apache.carbondata.common.Strings
 import 
org.apache.carbondata.common.exceptions.sql.MalformedCarbonCommandException
-import org.apache.carbondata.core.statusmanager.LoadMetadataDetails
+import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, 
StageInput}
+import org.apache.carbondata.core.util.path.CarbonTablePath
+
 
 case class CarbonShowSegmentsCommand(
 databaseNameOp: Option[String],
 tableName: String,
-limit: Option[String],
+limit: Option[Int],
+includeStage: Boolean = false,

Review comment:
   modified

##
File path: 
integration/spark/src/main/scala/org/apache/carbondata/api/CarbonStore.scala
##
@@ -63,13 +68,101 @@ object CarbonStore {
 }
 
 if (limit.isDefined) {
-  val lim = Integer.parseInt(limit.get)
-  segmentsMetadataDetails.slice(0, lim)
+  segmentsMetadataDetails.slice(0, limit.get)
 } else {
   segmentsMetadataDetails
 }
   }
 
+  /**
+   * Read stage files and return input files
+   */
+  def readStages(
+  tablePath: String,
+  configuration: Configuration): Seq[StageInput] = {
+val stageFiles = listStageFiles(
+  CarbonTablePath.getStageDir(tablePath), configuration)
+var output = Collections.synchronizedList(new util.ArrayList[StageInput]())
+output.addAll(readStageInput(stageFiles._1,
+  StageInput.StageStatus.Unload).asJavaCollection)
+output.addAll(readStageInput(stageFiles._2,
+  StageInput.StageStatus.Loading).asJavaCollection)
+Collections.sort(output, new Comparator[StageInput]() {
+  def compare(stageInput1: StageInput, stageInput2: StageInput): Int = {
+(stageInput2.getCreateTime - stageInput1.getCreateTime).intValue()
+  }
+})
+output.asScala
+  }
+
+  /**
+   * Read stage files and return input files
+   */
+  def readStageInput(
+  stageFiles: Seq[CarbonFile],
+  status: StageInput.StageStatus): Seq[StageInput] = {
+val gson = new Gson()
+val output = Collections.synchronizedList(new util.ArrayList[StageInput]())
+stageFiles.map { stage =>
+  val filePath = stage.getAbsolutePath
+  val stream = FileFactory.getDataInputStream(filePath)
+  try {
+val stageInput = gson.fromJson(new InputStreamReader(stream), 
classOf[StageInput])
+stageInput.setCreateTime(stage.getLastModifiedTime)
+stageInput.setStatus(status)
+output.add(stageInput)
+  } finally {
+stream.close()
+  }
+}
+output.asScala
+  }
+
+  /*
+   * Collect all stage files and matched success files.
+   * A stage file without success file will not be collected
+   */
+  def listStageFiles(
+loadDetailsDir: String,
+hadoopConf: Configuration): (Array[CarbonFile], Array[CarbonFile]) = {
+val dir = FileFactory.getCarbonFile(loadDetailsDir, hadoopConf)
+if (dir.exists()) {
+  var allFiles = dir.listFiles()
+  val successFiles = allFiles.filter { file =>
+file.getName.endsWith(CarbonTablePath.SUCCESS_FILE_SUBFIX)
+  }.map { file =>
+(file.getName.substring(0, file.getName.indexOf(".")), file)
+  }.toMap
+  val loadingFiles = allFiles.filter { file =>
+file.getName.endsWith(CarbonTablePath.LOADING_FILE_SUBFIX)
+  }.map { file =>
+(file.getName.substring(0, file.getName.indexOf(".")), file)
+  }.toMap
+
+  allFiles = allFiles.filter { file =>
+!file.getName.endsWith(CarbonTablePath.SUCCESS_FILE_SUBFIX)
+  }.filter { file =>
+!file.getName.endsWith(CarbonTablePath.LOADING_FILE_SUBFIX)
+  }
+
+  val unloadedFiles = allFiles.filter { file =>
+successFiles.contains(file.getName)

Review comment:
   modified

##
File path: 
integration/spark/src/main/scala/org/apache/carbondata/api/CarbonStore.scala
##
@@ -63,13 +68,101 @@ object CarbonStore {
 }
 
 if (limit.isDefined) {
-  val lim = Integer.parseInt(limit.get)
-