[
https://issues.apache.org/jira/browse/CARBONDATA-35?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15360965#comment-15360965
]
ASF GitHub Bot commented on CARBONDATA-35:
------------------------------------------
Github user QiangCai commented on a diff in the pull request:
https://github.com/apache/incubator-carbondata/pull/16#discussion_r69418836
--- Diff:
integration/spark/src/main/scala/org/carbondata/spark/rdd/CarbonGlobalDictionaryRDD.scala
---
@@ -321,3 +327,108 @@ class CarbonGlobalDictionaryGenerateRDD(
iter
}
}
+
+/**
+ * Set column dictionary partition format
+ *
+ * @param id partition id
+ * @param dimension current carbon dimension
+ */
+class CarbonColumnDictPatition(id: Int, dimension: CarbonDimension)
+ extends Partition {
+ override val index: Int = id
+ val preDefDictDimension = dimension
+}
+
+/**
+ * Use external column dict to generate global dictionary
+ *
+ * @param carbonLoadModel carbon load model
+ * @param sparkContext spark context
+ * @param table carbon table identifier
+ * @param dimensions carbon dimensions having predefined dict
+ * @param hdfsLocation carbon base store path
+ * @param dictFolderPath path of dictionary folder
+ */
+class CarbonColumnDictGenerateRDD(carbonLoadModel: CarbonLoadModel,
+ sparkContext: SparkContext,
+ table: CarbonTableIdentifier,
+ dimensions: Array[CarbonDimension],
+ hdfsLocation: String,
+ dictFolderPath: String)
+ extends RDD[(Int, ColumnDistinctValues)](sparkContext, Nil) with Logging
{
+
+ override def getPartitions: Array[Partition] = {
+ val primDimensionsBuffer = new ArrayBuffer[CarbonDimension]
+ for (dimension <- dimensions) {
+ val dims = getPrimDimensionWithDict(carbonLoadModel, dimension, true)
+ primDimensionsBuffer ++= dims
+ }
+ val primDimensions = primDimensionsBuffer.toArray
+ val primDimLength = primDimensions.length
+ val result = new Array[Partition](primDimLength)
+ var primColStarIndex = 0
+ for (i <- 0 until primDimLength) {
+ result(i) = new CarbonColumnDictPatition(i, primDimensions(i))
+ }
+ result
+ }
+
+ override def compute(split: Partition, context: TaskContext)
+ : Iterator[(Int, ColumnDistinctValues)] = {
+ val theSplit = split.asInstanceOf[CarbonColumnDictPatition]
+ val primDimension = theSplit.preDefDictDimension
+ // read the column dict data
+ val preDefDictFilePath =
carbonLoadModel.getPredefDictFilePath(primDimension)
+ var csvReader: CSVReader = null
+ var inputStream: DataInputStream = null
+ var colDictData: java.util.Iterator[Array[String]] = null
+ try {
+ inputStream = FileFactory.getDataInputStream(preDefDictFilePath,
+ FileFactory.getFileType(preDefDictFilePath))
+ csvReader = new CSVReader(new InputStreamReader(inputStream,
Charset.defaultCharset),
+ CSVReader.DEFAULT_SKIP_LINES, new CSVParser())
+ // read the column data to list iterator
+ colDictData = csvReader.readAll.iterator
+ } catch {
+ case ex: Exception =>
+ throw new DataLoadingException(s"Error in reading pre-defined " +
+ s"dictionary file:${ex.getMessage}")
+ } finally {
+ if (csvReader != null) {
+ try {
+ csvReader.close
+ } catch {
+ case ex: Exception =>
+ throw new DataLoadingException(s"Error in closing csvReader of
" +
--- End diff --
no need to throw exception.
just need to log messages.
> generate global dict using pre-defined dict from external column file
> ---------------------------------------------------------------------
>
> Key: CARBONDATA-35
> URL: https://issues.apache.org/jira/browse/CARBONDATA-35
> Project: CarbonData
> Issue Type: New Feature
> Reporter: Jay
> Priority: Minor
>
> user can set colName:columnfilePath in load DML, which can provide small
> amount of distinct values, then carbon can use these distinct values to
> generate dictionary and avoid reading from large raw csv file. this is a new
> feature and can improve the performance.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)