[ 
https://issues.apache.org/jira/browse/CARBONDATA-35?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15361103#comment-15361103
 ] 

ASF GitHub Bot commented on CARBONDATA-35:
------------------------------------------

Github user QiangCai commented on a diff in the pull request:

    https://github.com/apache/incubator-carbondata/pull/16#discussion_r69433879
  
    --- Diff: 
integration/spark/src/main/scala/org/carbondata/spark/rdd/CarbonGlobalDictionaryRDD.scala
 ---
    @@ -321,3 +327,108 @@ class CarbonGlobalDictionaryGenerateRDD(
         iter
       }
     }
    +
    +/**
    + *  Set column dictionry patition format
    +  *
    +  * @param id  partition id
    + * @param dimension  current carbon dimension
    + */
    +class CarbonColumnDictPatition(id: Int, dimension: CarbonDimension)
    +  extends Partition {
    +  override val index: Int = id
    +  val preDefDictDimension = dimension
    +}
    +
    +/**
    + * Use external column dict to generate global dictionary
    +  *
    +  * @param carbonLoadModel  carbon load model
    + * @param sparkContext  spark context
    + * @param table  carbon table identifier
    + * @param dimensions  carbon dimenisons having predefined dict
    + * @param hdfsLocation  carbon base store path
    + * @param dictFolderPath  path of dictionary folder
    + */
    +class CarbonColumnDictGenerateRDD(carbonLoadModel: CarbonLoadModel,
    +       sparkContext: SparkContext,
    +       table: CarbonTableIdentifier,
    +       dimensions: Array[CarbonDimension],
    +       hdfsLocation: String,
    +       dictFolderPath: String)
    +  extends RDD[(Int, ColumnDistinctValues)](sparkContext, Nil) with Logging 
{
    +
    +  override def getPartitions: Array[Partition] = {
    +    val primDimensionsBuffer = new ArrayBuffer[CarbonDimension]
    +    for (dimension <- dimensions) {
    +      val dims = getPrimDimensionWithDict(carbonLoadModel, dimension, true)
    +      primDimensionsBuffer ++= dims
    +    }
    +    val primDimensions = primDimensionsBuffer.toArray
    +    val primDimLength = primDimensions.length
    +    val result = new Array[Partition](primDimLength)
    +    var primColStarIndex = 0
    +    for (i <- 0 until primDimLength) {
    +      result(i) = new CarbonColumnDictPatition(i, primDimensions(i))
    +    }
    +    result
    +  }
    +
    +  override def compute(split: Partition, context: TaskContext)
    +  : Iterator[(Int, ColumnDistinctValues)] = {
    +    val theSplit = split.asInstanceOf[CarbonColumnDictPatition]
    +    val primDimension = theSplit.preDefDictDimension
    +    // read the column dict data
    +    val preDefDictFilePath = 
carbonLoadModel.getPredefDictFilePath(primDimension)
    +    var csvReader: CSVReader = null
    +    var inputStream: DataInputStream = null
    +    var colDictData: java.util.Iterator[Array[String]] = null
    +    try {
    +      inputStream = FileFactory.getDataInputStream(preDefDictFilePath,
    +        FileFactory.getFileType(preDefDictFilePath))
    +      csvReader = new CSVReader(new InputStreamReader(inputStream, 
Charset.defaultCharset),
    --- End diff --
    
    Please check the delimiter used by this CSVReader.


> generate global dict using pre-defined dict from external column file
> ---------------------------------------------------------------------
>
>                 Key: CARBONDATA-35
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-35
>             Project: CarbonData
>          Issue Type: New Feature
>            Reporter: Jay
>            Priority: Minor
>
> Users can set colName:columnfilePath in the load DML to point to an external 
> column file that provides a small number of distinct values. Carbon can then 
> use these distinct values to generate the dictionary and avoid reading the 
> large raw CSV file. This is a new feature and can improve performance.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to