Github user ManoharVanam commented on a diff in the pull request:
https://github.com/apache/carbondata/pull/1670#discussion_r157472681
--- Diff:
examples/spark2/src/main/scala/org/apache/carbondata/examples/ConcurrencyTest.scala
---
@@ -0,0 +1,352 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.examples
+
+import java.io.File
+import java.util
+import java.util.concurrent.{Callable, Executors, Future, TimeUnit}
+
+import scala.util.Random
+
+import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
+import org.apache.spark.sql.types._
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants
+import org.apache.carbondata.core.util.CarbonProperties
+
+// scalastyle:off println
+object ConcurrencyTest {
+
+ var totalNum = 10 * 1000 * 1 * 10
+ var threadNum = 16
+ var taskNum = 100
+ var resultIsEmpty = true
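+ // cardinalities of the id and city columns; must stay in sync with generateDataFrame below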
+ val cardinalityId = 10000 * 10000
+ val cardinalityCity = 6
+
+ def parquetTableName: String = "comparetest_parquet"
+
+ def orcTableName: String = "comparetest_orc"
+
+ def carbonTableName(version: String): String = s"comparetest_carbonV$version"
+
+ // Table schema:
+ // +-------------+-----------+-------------+-------------+------------+
+ // | Column name | Data type | Cardinality | Column type | Dictionary |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | id          | string    | 100,000,000 | dimension   | no         |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | city        | string    | 6           | dimension   | yes        |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | country     | string    | 6           | dimension   | yes        |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | planet      | string    | 10,007      | dimension   | yes        |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m1          | short     | NA          | measure     | no         |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m2          | int       | NA          | measure     | no         |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m3          | big int   | NA          | measure     | no         |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m4          | double    | NA          | measure     | no         |
+ // +-------------+-----------+-------------+-------------+------------+
+ // | m5          | decimal   | NA          | measure     | no         |
+ // +-------------+-----------+-------------+-------------+------------+
+
+ private def generateDataFrame(spark: SparkSession): DataFrame = {
+ val rdd = spark.sparkContext
+ .parallelize(1 to totalNum, 4)
+ .map { x =>
+ ((x % 100000000).toString, "city" + x % 6, "country" + x % 6, "planet" + x % 10007,
+ (x % 16).toShort, x / 2, (x << 1).toLong, x.toDouble / 13,
+ BigDecimal.valueOf(x.toDouble / 11))
+ }.map { x =>
+ Row(x._1, x._2, x._3, x._4, x._5, x._6, x._7, x._8, x._9)
+ }
+
+ val schema = StructType(
+ Seq(
+ StructField("id", StringType, nullable = false),
+ StructField("city", StringType, nullable = false),
+ StructField("country", StringType, nullable = false),
+ StructField("planet", StringType, nullable = false),
+ StructField("m1", ShortType, nullable = false),
+ StructField("m2", IntegerType, nullable = false),
+ StructField("m3", LongType, nullable = false),
+ StructField("m4", DoubleType, nullable = false),
+ StructField("m5", DecimalType(30, 10), nullable = false)
+ )
+ )
+
+ spark.createDataFrame(rdd, schema)
+ }
+
+ // performance test queries, designed to exercise various data access types
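+ // random filter values, bounded by totalNum so they fall inside the generated data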
+ val r = new Random()
+ val tmpId = r.nextInt(cardinalityId) % totalNum
+ val tmpCity = "city" + (r.nextInt(cardinalityCity) % totalNum)
+ val queries: Array[Query] = Array(
+ Query(
+ "select * from $table" + s" where id = '$tmpId' ",
+ "filter scan",
+ "filter on high card dimension"
+ ),
+
+ Query(
+ "select id from $table" + s" where id = '$tmpId' ",
+ "filter scan",
+ "filter on high card dimension"
+ ),
+
+ Query(
+ "select * from $table" + s" where city = '$tmpCity' ",
+ "filter scan",
+ "filter on high card dimension"
+ ),
+
+ Query(
+ "select city from $table" + s" where city = '$tmpCity' ",
+ "filter scan",
+ "filter on high card dimension"
+ ),
+
+ Query(
+ "select country, sum(m1) from $table group by country",
+ "aggregate",
+ "group by on big data, on medium card column, medium result set,"
+ ),
+
+ Query(
+ "select country, sum(m1) from $table" +
+ s" where id = '$tmpId' group by country",
+ "aggregate",
+ "group by on big data, on medium card column, medium result set,"
+ ),
+
+ Query(
+ "select t1.country, sum(t1.m1) from $table t1 join $table t2"
+ + s" on t1.id = t2.id where t1.id = '$tmpId' group by t1.country",
+ "aggregate",
+ "group by on big data, on medium card column, medium result set,"
+ ),
+
+ Query(
+ "select t2.country, sum(t2.m1) " +
+ "from $table t1 join $table t2 join $table t3 " +
+ "join $table t4 join $table t5 join $table t6 join $table t7 " +
+ s"on t1.id=t2.id and t1.id=t3.id and t1.id=t4.id " +
+ s"and t1.id=t5.id and t1.id=t6.id and " +
+ s"t1.id=t7.id " +
+ s" where t2.id = '$tmpId' " +
+ s" group by t2.country",
+ "aggregate",
+ "group by on big data, on medium card column, medium result set,"
+ )
+ )
+
+ private def loadParquetTable(spark: SparkSession, input: DataFrame, table: String)
+ : Double = time {
+ // partitioned by the last digit of the id column
+ val dfWithPartition = input.withColumn("partitionCol", input.col("id").%(10))
+ dfWithPartition.write
+ .partitionBy("partitionCol")
+ .mode(SaveMode.Overwrite)
+ .parquet(table)
+ spark.read.parquet(table).createOrReplaceTempView(table)
+ }
+
+ private def loadOrcTable(spark: SparkSession, input: DataFrame, table: String): Double = time {
+ input.write
+ .mode(SaveMode.Overwrite)
+ .orc(table)
+ spark.read.orc(table).createOrReplaceTempView(table)
+ }
+
+ private def loadCarbonTable(spark: SparkSession, input: DataFrame, tableName: String): Double = {
+ CarbonProperties.getInstance().addProperty(
+ CarbonCommonConstants.CARBON_DATA_FILE_VERSION,
+ "3"
+ )
+ spark.sql(s"drop table if exists $tableName")
+ time {
+ input.write
+ .format("carbondata")
+ .option("tableName", tableName)
+ .option("tempCSV", "false")
+ .option("single_pass", "true")
+ .option("dictionary_exclude", "id") // id is high cardinality
column
+ .option("table_blocksize", "32")
+ .mode(SaveMode.Overwrite)
+ .save()
+ }
+ }
+
+ // load data into a parquet or orc table (table1) and a carbon table (table2)
+ def prepareTable(spark: SparkSession, table1: String, table2: String): Unit = {
+ val df = generateDataFrame(spark).cache
+ println(s"generating ${df.count} records, schema: ${df.schema}")
+ val table1Time = if (table1.endsWith("parquet")) {
+ loadParquetTable(spark, df, table1)
+ } else if (table1.endsWith("orc")) {
+ loadOrcTable(spark, df, table1)
+ } else {
+ sys.error("invalid table: " + table1)
+ }
+ val table2Time = loadCarbonTable(spark, df, table2)
+ println(s"load completed, time: $table1Time, $table2Time")
+ df.unpersist()
+ }
+
+ // Run all queries for the specified table
+ private def runQueries(spark: SparkSession, tableName: String): Unit = {
+ println(s"start running queries for $tableName...")
+ val start = System.currentTimeMillis()
+ println("90% time: xx.xx sec\t99% time: xx.xx sec\tlast time: xx.xx
sec\t " +
+ "running query sql\taverage time: xx.xx sec\t result: show it when
ResultIsEmpty is false")
+ queries.zipWithIndex.map { case (query, index) =>
+ val sqlText = query.sqlText.replace("$table", tableName)
+
+ val executorService = Executors.newFixedThreadPool(threadNum)
+ val results = new util.ArrayList[Future[Results]]()
+ for (num <- 1 to taskNum) {
+ results.add(executorService.submit(new QueryTask(spark, sqlText)))
--- End diff --
For better concurrency, I think invokeAll is a better option than
submitting each task individually.
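
A minimal sketch of what I mean, assuming QueryTask implements
Callable[Results] as it does in this diff (threadNum and taskNum are the
thread/task count fields at the top of the file):

    val executorService = Executors.newFixedThreadPool(threadNum)
    // build the whole batch of tasks first
    val tasks = new util.ArrayList[Callable[Results]]()
    for (num <- 1 to taskNum) {
      tasks.add(new QueryTask(spark, sqlText))
    }
    // invokeAll submits the batch in one call and blocks until every task
    // finishes, returning one Future per task in submission order
    val results: util.List[Future[Results]] = executorService.invokeAll(tasks)

invokeAll also returns only after all tasks complete, so no extra
await/collection loop is needed.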
---