[CARBONDATA-2434] Add ExternalTableExample and LuceneDataMapExample. For preparing 1.4.0 release.
This closes #2268 Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/d5da9a19 Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/d5da9a19 Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/d5da9a19 Branch: refs/heads/spark-2.3 Commit: d5da9a19434b9a28912110dac422cc89630ac93e Parents: 09feb9c Author: chenliang613 <chenliang...@huawei.com> Authored: Fri May 4 10:35:52 2018 +0800 Committer: ravipesala <ravi.pes...@gmail.com> Committed: Wed May 9 00:01:46 2018 +0530 ---------------------------------------------------------------------- .../examples/ExternalTableExample.scala | 104 +++++++++++++++++ .../examples/LuceneDataMapExample.scala | 116 +++++++++++++++++++ .../examples/PreAggregateDataMapExample.scala | 12 +- .../carbondata/examplesCI/RunExamples.scala | 8 ++ 4 files changed, 234 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/carbondata/blob/d5da9a19/examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala ---------------------------------------------------------------------- diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala new file mode 100644 index 0000000..9d5ee8e --- /dev/null +++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.examples + +import java.io.File + +import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession} + +import org.apache.carbondata.core.constants.CarbonCommonConstants +import org.apache.carbondata.core.metadata.CarbonTableIdentifier +import org.apache.carbondata.core.util.CarbonProperties +import org.apache.carbondata.examples.util.ExampleUtils + +/** + * This example is for showing how to create external table with location. + */ + +object ExternalTableExample { + + def main(args: Array[String]) { + val spark = ExampleUtils.createCarbonSession("ExternalTableExample") + exampleBody(spark) + spark.close() + } + + def exampleBody(spark : SparkSession): Unit = { + + CarbonProperties.getInstance() + .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd") + + // Create origin_table + spark.sql("DROP TABLE IF EXISTS origin_table") + spark.sql( + s""" + | CREATE TABLE origin_table( + | shortField SHORT, + | intField INT, + | bigintField LONG, + | doubleField DOUBLE, + | stringField STRING, + | timestampField TIMESTAMP, + | decimalField DECIMAL(18,2), + | dateField DATE, + | charField CHAR(5), + | floatField FLOAT + | ) + | STORED BY 'carbondata' + """.stripMargin) + + val rootPath = new File(this.getClass.getResource("/").getPath + + "../../../..").getCanonicalPath + val path = s"$rootPath/examples/spark2/src/main/resources/data.csv" + + // load 4 times, each load has 10 rows data + // scalastyle:off + (1 to 4).foreach(_ => spark.sql( + s""" + | LOAD DATA LOCAL INPATH '$path' + | INTO TABLE origin_table + | 
OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#') + """.stripMargin)) + // scalastyle:on + + // 40 rows + spark.sql("SELECT count(*) FROM origin_table").show() + + val origin_table_path = CarbonEnv.getTablePath(Some("default"), "origin_table")(spark) + + // Create external_table + spark.sql("DROP TABLE IF EXISTS external_table") + spark.sql("CREATE EXTERNAL TABLE external_table STORED BY 'carbondata'" + + s" LOCATION '$origin_table_path'") + spark.sql("SELECT count(*) FROM external_table").show() + + // Load 2 times again + (1 to 2).foreach(_ => spark.sql( + s""" + | LOAD DATA LOCAL INPATH '$path' + | INTO TABLE origin_table + | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#') + """.stripMargin)) + + spark.sql("SELECT count(*) FROM external_table").show() + + // Drop tables + spark.sql("DROP TABLE IF EXISTS origin_table") + spark.sql("DROP TABLE IF EXISTS external_table") + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/d5da9a19/examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala ---------------------------------------------------------------------- diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala new file mode 100644 index 0000000..efe2a63 --- /dev/null +++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.carbondata.examples + +import org.apache.spark.sql.{SaveMode, SparkSession} + +import org.apache.carbondata.examples.util.ExampleUtils + + +/** + * This example is for lucene datamap. + */ + +object LuceneDataMapExample { + + def main(args: Array[String]) { + val spark = ExampleUtils.createCarbonSession("LuceneDataMapExample") + exampleBody(spark) + spark.close() + } + + def exampleBody(spark : SparkSession): Unit = { + + // build the test data, please increase the data for more obvious comparison. + // if set the data is larger than 100M, it will take 10+ mins. + import scala.util.Random + + import spark.implicits._ + val r = new Random() + val df = spark.sparkContext.parallelize(1 to 10 * 10 * 1000) + .map(x => ("which test" + r.nextInt(10000) + " good" + r.nextInt(10), + "who and name" + x % 8, "city" + x % 50, x % 60)) + .toDF("id", "name", "city", "age") + + spark.sql("DROP TABLE IF EXISTS personTable") + df.write.format("carbondata") + .option("tableName", "personTable") + .option("compress", "true") + .mode(SaveMode.Overwrite).save() + + // create lucene datamap on personTable + spark.sql( + s""" + | CREATE DATAMAP IF NOT EXISTS dm ON TABLE personTable + | USING 'lucene' + | DMProperties('INDEX_COLUMNS'='id , name') + """.stripMargin) + + spark.sql("refresh datamap dm ON TABLE personTable") + + // 1. 
Compare the performance: + + def time(code: => Unit): Double = { + val start = System.currentTimeMillis() + code + // return time in second + (System.currentTimeMillis() - start).toDouble / 1000 + } + + val time_without_lucenedatamap = time { + + spark.sql( + s""" + | SELECT count(*) + | FROM personTable where id like '% test1 %' + """.stripMargin).show() + + } + + val time_with_lucenedatamap = time { + + spark.sql( + s""" + | SELECT count(*) + | FROM personTable where TEXT_MATCH('id:test1') + """.stripMargin).show() + + } + + // scalastyle:off + println("time for query on table with lucene datamap table:" + time_with_lucenedatamap.toString) + println("time for query on table without lucene datamap table:" + time_without_lucenedatamap.toString) + // scalastyle:on + + // 2. Search for word "test1" and not "good" in the id field + spark.sql( + s""" + | SELECT id,name + | FROM personTable where TEXT_MATCH('id:test1 -id:good1') + """.stripMargin).show(100) + + // 3. TEXT_MATCH_WITH_LIMIT usage: + spark.sql( + s""" + | SELECT id,name + | FROM personTable where TEXT_MATCH_WITH_LIMIT('id:test1',10) + """.stripMargin).show() + + spark.sql("DROP TABLE IF EXISTS personTable") + } +} http://git-wip-us.apache.org/repos/asf/carbondata/blob/d5da9a19/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala ---------------------------------------------------------------------- diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala index 367c011..b008bd4 100644 --- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala +++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala @@ -65,10 +65,6 @@ object PreAggregateDataMapExample { LOAD DATA LOCAL INPATH '$testData' into table mainTable """) - spark.sql(""" - select 
* from mainTable - """) - spark.sql(s""" LOAD DATA LOCAL INPATH '$testData' into table mainTable_other """) @@ -152,16 +148,20 @@ object PreAggregateDataMapExample { // 2.compare the performance : with pre-aggregate VS main table - // build test data, if set the data is larger than 100M, it will take 10+ mins. + // build the test data, please increase the data for more obvious comparison. + // if set the data is larger than 100M, it will take 10+ mins. + import spark.implicits._ import scala.util.Random val r = new Random() - val df = spark.sparkContext.parallelize(1 to 10 * 1000 * 1000) + val df = spark.sparkContext.parallelize(1 to 10 * 10 * 1000) .map(x => ("No." + r.nextInt(100000), "name" + x % 8, "city" + x % 50, x % 60)) .toDF("ID", "name", "city", "age") // Create table with pre-aggregate + spark.sql("DROP TABLE IF EXISTS personTable") + spark.sql("DROP TABLE IF EXISTS personTableWithoutAgg") df.write.format("carbondata") .option("tableName", "personTable") .option("compress", "true") http://git-wip-us.apache.org/repos/asf/carbondata/blob/d5da9a19/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala ---------------------------------------------------------------------- diff --git a/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala b/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala index ddf8ee5..2b9b999 100644 --- a/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala +++ b/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala @@ -105,4 +105,12 @@ class RunExamples extends QueryTest with BeforeAndAfterAll { test("TimeSeriesPreAggregateTableExample") { TimeSeriesPreAggregateTableExample.exampleBody(spark) } + + test("LuceneDataMapExample") { + LuceneDataMapExample.exampleBody(spark) + } + + test("ExternalTableExample") { + ExternalTableExample.exampleBody(spark) + } } \ No newline at end of file