[ 
https://issues.apache.org/jira/browse/CARBONDATA-35?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15360965#comment-15360965
 ] 

ASF GitHub Bot commented on CARBONDATA-35:
------------------------------------------

Github user QiangCai commented on a diff in the pull request:

    https://github.com/apache/incubator-carbondata/pull/16#discussion_r69418836
  
    --- Diff: 
integration/spark/src/main/scala/org/carbondata/spark/rdd/CarbonGlobalDictionaryRDD.scala
 ---
    @@ -321,3 +327,108 @@ class CarbonGlobalDictionaryGenerateRDD(
         iter
       }
     }
    +
    +/**
    + *  Set column dictionry patition format
    +  *
    +  * @param id  partition id
    + * @param dimension  current carbon dimension
    + */
    +class CarbonColumnDictPatition(id: Int, dimension: CarbonDimension)
    +  extends Partition {
    +  override val index: Int = id
    +  val preDefDictDimension = dimension
    +}
    +
    +/**
    + * Use external column dict to generate global dictionary
    +  *
    +  * @param carbonLoadModel  carbon load model
    + * @param sparkContext  spark context
    + * @param table  carbon table identifier
    + * @param dimensions  carbon dimenisons having predefined dict
    + * @param hdfsLocation  carbon base store path
    + * @param dictFolderPath  path of dictionary folder
    + */
    +class CarbonColumnDictGenerateRDD(carbonLoadModel: CarbonLoadModel,
    +       sparkContext: SparkContext,
    +       table: CarbonTableIdentifier,
    +       dimensions: Array[CarbonDimension],
    +       hdfsLocation: String,
    +       dictFolderPath: String)
    +  extends RDD[(Int, ColumnDistinctValues)](sparkContext, Nil) with Logging 
{
    +
    +  override def getPartitions: Array[Partition] = {
    +    val primDimensionsBuffer = new ArrayBuffer[CarbonDimension]
    +    for (dimension <- dimensions) {
    +      val dims = getPrimDimensionWithDict(carbonLoadModel, dimension, true)
    +      primDimensionsBuffer ++= dims
    +    }
    +    val primDimensions = primDimensionsBuffer.toArray
    +    val primDimLength = primDimensions.length
    +    val result = new Array[Partition](primDimLength)
    +    var primColStarIndex = 0
    +    for (i <- 0 until primDimLength) {
    +      result(i) = new CarbonColumnDictPatition(i, primDimensions(i))
    +    }
    +    result
    +  }
    +
    +  override def compute(split: Partition, context: TaskContext)
    +  : Iterator[(Int, ColumnDistinctValues)] = {
    +    val theSplit = split.asInstanceOf[CarbonColumnDictPatition]
    +    val primDimension = theSplit.preDefDictDimension
    +    // read the column dict data
    +    val preDefDictFilePath = 
carbonLoadModel.getPredefDictFilePath(primDimension)
    +    var csvReader: CSVReader = null
    +    var inputStream: DataInputStream = null
    +    var colDictData: java.util.Iterator[Array[String]] = null
    +    try {
    +      inputStream = FileFactory.getDataInputStream(preDefDictFilePath,
    +        FileFactory.getFileType(preDefDictFilePath))
    +      csvReader = new CSVReader(new InputStreamReader(inputStream, 
Charset.defaultCharset),
    +        CSVReader.DEFAULT_SKIP_LINES, new CSVParser())
    +      // read the column data to list iterator
    +      colDictData = csvReader.readAll.iterator
    +    } catch {
    +        case ex: Exception =>
    +          throw new DataLoadingException(s"Error in reading pre-defined " +
    +            s"dictionary file:${ex.getMessage}")
    +    } finally {
    +      if (csvReader != null) {
    +        try {
    +          csvReader.close
    +        } catch {
    +          case ex: Exception =>
    +            throw new DataLoadingException(s"Error in closing csvReader of 
" +
    --- End diff --
    
    There is no need to throw an exception here.
    Logging the error message is enough.


> generate global dict using pre-defined dict from external column file
> ---------------------------------------------------------------------
>
>                 Key: CARBONDATA-35
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-35
>             Project: CarbonData
>          Issue Type: New Feature
>            Reporter: Jay
>            Priority: Minor
>
> Users can set colName:columnfilePath in the load DML, where the column file 
> provides a small number of distinct values. Carbon can then use these 
> distinct values to generate the dictionary and avoid reading them from the 
> large raw CSV file. This is a new feature and can improve performance.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to