[ 
https://issues.apache.org/jira/browse/CARBONDATA-35?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15361148#comment-15361148
 ] 

ASF GitHub Bot commented on CARBONDATA-35:
------------------------------------------

Github user QiangCai commented on a diff in the pull request:

    https://github.com/apache/incubator-carbondata/pull/16#discussion_r69440262
  
    --- Diff: 
integration/spark/src/test/scala/org/carbondata/spark/util/ExternalColumnDictionaryTestCase.scala
 ---
    @@ -0,0 +1,124 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing,
    + * software distributed under the License is distributed on an
    + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    + * KIND, either express or implied.  See the License for the
    + * specific language governing permissions and limitations
    + * under the License.
    + */
    +package org.carbondata.spark.util
    +
    +import java.io.File
    +
    +import org.apache.spark.sql.{CarbonEnv, CarbonRelation}
    +import org.apache.spark.sql.common.util.CarbonHiveContext
    +import org.apache.spark.sql.common.util.CarbonHiveContext.sql
    +import org.apache.spark.sql.common.util.QueryTest
    +
    +import org.carbondata.core.carbon.CarbonDataLoadSchema
    +import org.carbondata.spark.load.CarbonLoadModel
    +
    +import org.scalatest.BeforeAndAfterAll
    +
    +/**
    + * test case for external column dictionary generation
    + * also support complicated type
    + */
    +class ExternalColumnDictionaryTestCase extends QueryTest with 
BeforeAndAfterAll {
    +
    +  var extComplexRelation: CarbonRelation = _
    +  var filePath: String = _
    +  var pwd: String = _
    +  var complexFilePath: String = _
    +  var extColDictFilePath: String = _
    +  var header: String = _
    +
    +  def buildTestData() = {
    +    pwd = new File(this.getClass.getResource("/").getPath + "/../../").
    +      getCanonicalPath.replace("\\", "/")
    +    filePath = pwd + "/src/test/resources/sample.csv"
    +    complexFilePath = pwd + "/src/test/resources/complexdata2.csv"
    +    extColDictFilePath = "deviceInformationId:" + pwd +
    +      "/src/test/resources/deviceInformationId.csv," +
    +      "mobile.imei:" + pwd + "/src/test/resources/mobileimei.csv," +
    +      "mac:" + pwd + "/src/test/resources/mac.csv," +
    +      "locationInfo.ActiveCountry:" + pwd + 
"/src/test/resources/locationInfoActiveCountry.csv"
    +    header = 
"deviceInformationId,channelsId,ROMSize,purchasedate,mobile,MAC," +
    +      "locationinfo,proddate,gamePointId,contractNumber"
    +  }
    +
    +  def buildTable() = {
    +    try {
    +      sql("""CREATE TABLE extComplextypes (deviceInformationId int,
    +       channelsId string, ROMSize string, purchasedate string,
    +       mobile struct<imei:string, imsi:string>, MAC array<string>,
    +       locationinfo array<struct<ActiveAreaId:int, ActiveCountry:string,
    +       ActiveProvince:string, Activecity:string, ActiveDistrict:string, 
ActiveStreet:string>>,
    +       proddate 
struct<productionDate:string,activeDeactivedate:array<string>>,
    +       gamePointId double,contractNumber double)
    +       STORED BY 'org.apache.carbondata.format'
    +       TBLPROPERTIES('DICTIONARY_INCLUDE' = 'deviceInformationId')
    +          """)
    +    } catch {
    +      case ex: Throwable => logError(ex.getMessage + "\r\n" + 
ex.getStackTraceString)
    +    }
    +  }
    +
    +  def buildRelation() = {
    +    val catalog = CarbonEnv.getInstance(CarbonHiveContext).carbonCatalog
    +    extComplexRelation = catalog.lookupRelation1(None, "extComplextypes", 
None)(CarbonHiveContext)
    +      .asInstanceOf[CarbonRelation]
    +  }
    +  def buildCarbonLoadModel(relation: CarbonRelation,
    +                           filePath:String,
    +                           header: String,
    +                           extColFilePath: String): CarbonLoadModel = {
    +    val carbonLoadModel = new CarbonLoadModel
    +    
carbonLoadModel.setTableName(relation.cubeMeta.carbonTableIdentifier.getDatabaseName)
    +    
carbonLoadModel.setDatabaseName(relation.cubeMeta.carbonTableIdentifier.getTableName)
    +    val table = relation.cubeMeta.carbonTable
    +    val carbonSchema = new CarbonDataLoadSchema(table)
    +    carbonLoadModel.setDatabaseName(table.getDatabaseName)
    +    carbonLoadModel.setTableName(table.getFactTableName)
    +    carbonLoadModel.setCarbonDataLoadSchema(carbonSchema)
    +    carbonLoadModel.setFactFilePath(filePath)
    +    carbonLoadModel.setCsvHeader(header)
    +    carbonLoadModel.setCsvDelimiter(",")
    +    carbonLoadModel.setComplexDelimiterLevel1("\\$")
    +    carbonLoadModel.setComplexDelimiterLevel2("\\:")
    +    carbonLoadModel.setColDictFilePath(extColFilePath)
    +    carbonLoadModel
    +  }
    +
    +  override def beforeAll {
    +    buildTestData
    +    buildTable
    +    buildRelation
    +  }
    +
    +  test("[issue-126]Generate global dictionary from external column file") {
    +    // test external column file to generate global dict
    +    val carbonLoadModel = buildCarbonLoadModel(extComplexRelation, 
complexFilePath,
    +      header, extColDictFilePath)
    +    GlobalDictionaryUtil.generateGlobalDictionary(CarbonHiveContext, 
carbonLoadModel,
    +      extComplexRelation.cubeMeta.storePath)
    +
    +    // check whether the dictionary is generated
    +    DictionaryTestCaseUtil.checkDictionary(
    +      extComplexRelation, "deviceInformationId", "10086")
    --- End diff --
    
    please  add incremental test case


> generate global dict using pre-defined dict from external column file
> ---------------------------------------------------------------------
>
>                 Key: CARBONDATA-35
>                 URL: https://issues.apache.org/jira/browse/CARBONDATA-35
>             Project: CarbonData
>          Issue Type: New Feature
>            Reporter: Jay
>            Priority: Minor
>
> user can set colName:columnfilePath in load DML, which can provide small 
> amount of distinct values, then carbon can use these distinct values to 
> generate dictionary and avoid reading from large raw csv file. this is a new 
> feature and can improve the performance.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to