[
https://issues.apache.org/jira/browse/CARBONDATA-35?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15361103#comment-15361103
]
ASF GitHub Bot commented on CARBONDATA-35:
------------------------------------------
Github user QiangCai commented on a diff in the pull request:
https://github.com/apache/incubator-carbondata/pull/16#discussion_r69433879
--- Diff: integration/spark/src/main/scala/org/carbondata/spark/rdd/CarbonGlobalDictionaryRDD.scala ---
@@ -321,3 +327,108 @@ class CarbonGlobalDictionaryGenerateRDD(
iter
}
}
+
+/**
+ * Set column dictionary partition format
+ *
+ * @param id partition id
+ * @param dimension current carbon dimension
+ */
+class CarbonColumnDictPatition(id: Int, dimension: CarbonDimension)
+ extends Partition {
+ override val index: Int = id
+ val preDefDictDimension = dimension
+}
+
+/**
+ * Use external column dict to generate global dictionary
+ *
+ * @param carbonLoadModel carbon load model
+ * @param sparkContext spark context
+ * @param table carbon table identifier
+ * @param dimensions carbon dimensions having predefined dict
+ * @param hdfsLocation carbon base store path
+ * @param dictFolderPath path of dictionary folder
+ */
+class CarbonColumnDictGenerateRDD(carbonLoadModel: CarbonLoadModel,
+ sparkContext: SparkContext,
+ table: CarbonTableIdentifier,
+ dimensions: Array[CarbonDimension],
+ hdfsLocation: String,
+ dictFolderPath: String)
+ extends RDD[(Int, ColumnDistinctValues)](sparkContext, Nil) with Logging {
+
+ override def getPartitions: Array[Partition] = {
+ val primDimensionsBuffer = new ArrayBuffer[CarbonDimension]
+ for (dimension <- dimensions) {
+ val dims = getPrimDimensionWithDict(carbonLoadModel, dimension, true)
+ primDimensionsBuffer ++= dims
+ }
+ val primDimensions = primDimensionsBuffer.toArray
+ val primDimLength = primDimensions.length
+ val result = new Array[Partition](primDimLength)
+ var primColStarIndex = 0
+ for (i <- 0 until primDimLength) {
+ result(i) = new CarbonColumnDictPatition(i, primDimensions(i))
+ }
+ result
+ }
+
+ override def compute(split: Partition, context: TaskContext)
+ : Iterator[(Int, ColumnDistinctValues)] = {
+ val theSplit = split.asInstanceOf[CarbonColumnDictPatition]
+ val primDimension = theSplit.preDefDictDimension
+ // read the column dict data
+ val preDefDictFilePath = carbonLoadModel.getPredefDictFilePath(primDimension)
+ var csvReader: CSVReader = null
+ var inputStream: DataInputStream = null
+ var colDictData: java.util.Iterator[Array[String]] = null
+ try {
+ inputStream = FileFactory.getDataInputStream(preDefDictFilePath,
+ FileFactory.getFileType(preDefDictFilePath))
+ csvReader = new CSVReader(new InputStreamReader(inputStream, Charset.defaultCharset),
--- End diff --
please check delimiter
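A minimal sketch of one way to address this comment, reusing the carbonLoadModel, inputStream and csvReader values already in scope in the diff above. getCsvDelimiter is assumed to be the CarbonLoadModel accessor for the delimiter supplied with the load command, and the fallback to ',' is likewise an assumption:

import java.io.InputStreamReader
import java.nio.charset.Charset
// opencsv's CSVReader; match the package (au.com.bytecode.opencsv or com.opencsv)
// to the PR's existing imports
import au.com.bytecode.opencsv.CSVReader

// Use the delimiter configured on the load model instead of opencsv's default
// separator, falling back to ',' when none was supplied with the load command.
val rawDelimiter = carbonLoadModel.getCsvDelimiter   // assumed accessor
val delimiter = if (rawDelimiter == null || rawDelimiter.isEmpty) ',' else rawDelimiter.charAt(0)
csvReader = new CSVReader(
  new InputStreamReader(inputStream, Charset.defaultCharset), delimiter)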
> generate global dict using pre-defined dict from external column file
> ---------------------------------------------------------------------
>
> Key: CARBONDATA-35
> URL: https://issues.apache.org/jira/browse/CARBONDATA-35
> Project: CarbonData
> Issue Type: New Feature
> Reporter: Jay
> Priority: Minor
>
> The user can set colName:columnfilePath in the load DML to provide a small
> set of distinct values; Carbon can then use these distinct values to generate
> the dictionary instead of reading them from the large raw CSV file. This is a
> new feature and can improve load performance.
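As a hypothetical illustration of the colName:columnfilePath option described above, a load using a pre-defined column dictionary could look roughly like the sketch below. The option name COLUMNDICT, the table name, and the file paths are assumptions for illustration, not the final DML syntax; cc is assumed to be a CarbonContext (or any SQLContext that understands Carbon load DML).

// Load sales.csv, telling Carbon to build the dictionary for the country
// column from a small external file instead of scanning the raw CSV.
cc.sql(
  """
    | LOAD DATA INPATH 'hdfs://hacluster/data/sales.csv'
    | INTO TABLE sales
    | OPTIONS('COLUMNDICT' = 'country:hdfs://hacluster/dict/country_dict.csv')
  """.stripMargin)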
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)