[17/20] carbondata-site git commit: Updated changes for 1.5.0 release

chenliang613 Wed, 17 Oct 2018 03:21:53 -0700
http://git-wip-us.apache.org/repos/asf/carbondata-site/blob/6f8949f1/content/useful-tips-on-carbondata.html
----------------------------------------------------------------------
diff --git a/content/useful-tips-on-carbondata.html 
b/content/useful-tips-on-carbondata.html
new file mode 100644
index 0000000..912cb48
--- /dev/null
+++ b/content/useful-tips-on-carbondata.html
@@ -0,0 +1,480 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link href='images/favicon.ico' rel='shortcut icon' type='image/x-icon'>
+    <!-- The above 3 meta tags *must* come first in the head; any other head 
content must come *after* these tags -->
+    <title>CarbonData</title>
+    <style>
+
+    </style>
+    <!-- Bootstrap -->
+
+    <link rel="stylesheet" href="css/bootstrap.min.css">
+    <link href="css/style.css" rel="stylesheet">
+    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media 
queries -->
+    <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
+    <!--[if lt IE 9]>
+    <script 
src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js";></script>
+    <script 
src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
+    <![endif]-->
+    <script src="js/jquery.min.js"></script>
+    <script src="js/bootstrap.min.js"></script>
+
+
+</head>
+<body>
+<header>
+    <nav class="navbar navbar-default navbar-custom cd-navbar-wrapper">
+        <div class="container">
+            <div class="navbar-header">
+                <button aria-controls="navbar" aria-expanded="false" 
data-target="#navbar" data-toggle="collapse"
+                        class="navbar-toggle collapsed" type="button">
+                    <span class="sr-only">Toggle navigation</span>
+                    <span class="icon-bar"></span>
+                    <span class="icon-bar"></span>
+                    <span class="icon-bar"></span>
+                </button>
+                <a href="index.html" class="logo">
+                    <img src="images/CarbonDataLogo.png" alt="CarbonData logo" 
title="CarbonData logo"/>
+                </a>
+            </div>
+            <div class="navbar-collapse collapse cd_navcontnt" id="navbar">
+                <ul class="nav navbar-nav navbar-right navlist-custom">
+                    <li><a href="index.html" class="hidden-xs"><i class="fa 
fa-home" aria-hidden="true"></i> </a>
+                    </li>
+                    <li><a href="index.html" class="hidden-lg hidden-md 
hidden-sm">Home</a></li>
+                    <li class="dropdown">
+                        <a href="#" class="dropdown-toggle " 
data-toggle="dropdown" role="button" aria-haspopup="true"
+                           aria-expanded="false"> Download <span 
class="caret"></span></a>
+                        <ul class="dropdown-menu">
+                            <li>
+                                <a 
href="https://dist.apache.org/repos/dist/release/carbondata/1.4.1/";
+                                   target="_blank">Apache CarbonData 
1.4.1</a></li>
+                                                       <li>
+                                <a 
href="https://dist.apache.org/repos/dist/release/carbondata/1.4.0/";
+                                   target="_blank">Apache CarbonData 
1.4.0</a></li>
+                            <li>
+                                <a 
href="https://dist.apache.org/repos/dist/release/carbondata/1.3.1/";
+                                   target="_blank">Apache CarbonData 
1.3.1</a></li>
+                            <li>
+                                <a 
href="https://dist.apache.org/repos/dist/release/carbondata/1.3.0/";
+                                   target="_blank">Apache CarbonData 
1.3.0</a></li>
+                            <li>
+                                <a 
href="https://cwiki.apache.org/confluence/display/CARBONDATA/Releases";
+                                   target="_blank">Release Archive</a></li>
+                        </ul>
+                    </li>
+                    <li><a href="mainpage.html" 
class="active">Documentation</a></li>
+                    <li class="dropdown">
+                        <a href="#" class="dropdown-toggle" 
data-toggle="dropdown" role="button" aria-haspopup="true"
+                           aria-expanded="false">Community <span 
class="caret"></span></a>
+                        <ul class="dropdown-menu">
+                            <li>
+                                <a 
href="https://github.com/apache/carbondata/blob/master/docs/How-to-contribute-to-Apache-CarbonData.md";
+                                   target="_blank">Contributing to 
CarbonData</a></li>
+                            <li>
+                                <a 
href="https://github.com/apache/carbondata/blob/master/docs/release-guide.md";
+                                   target="_blank">Release Guide</a></li>
+                            <li>
+                                <a 
href="https://cwiki.apache.org/confluence/display/CARBONDATA/PMC+and+Committers+member+list";
+                                   target="_blank">Project PMC and 
Committers</a></li>
+                            <li>
+                                <a 
href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=66850609";
+                                   target="_blank">CarbonData Meetups</a></li>
+                            <li><a href="security.html">Apache CarbonData 
Security</a></li>
+                            <li><a 
href="https://issues.apache.org/jira/browse/CARBONDATA"; target="_blank">Apache
+                                Jira</a></li>
+                            <li><a href="videogallery.html">CarbonData Videos 
</a></li>
+                        </ul>
+                    </li>
+                    <li class="dropdown">
+                        <a href="http://www.apache.org/"; class="apache_link 
hidden-xs dropdown-toggle"
+                           data-toggle="dropdown" role="button" 
aria-haspopup="true" aria-expanded="false">Apache</a>
+                        <ul class="dropdown-menu">
+                            <li><a href="http://www.apache.org/"; 
target="_blank">Apache Homepage</a></li>
+                            <li><a href="http://www.apache.org/licenses/"; 
target="_blank">License</a></li>
+                            <li><a 
href="http://www.apache.org/foundation/sponsorship.html";
+                                   target="_blank">Sponsorship</a></li>
+                            <li><a 
href="http://www.apache.org/foundation/thanks.html"; 
target="_blank">Thanks</a></li>
+                        </ul>
+                    </li>
+
+                    <li class="dropdown">
+                        <a href="http://www.apache.org/"; class="hidden-lg 
hidden-md hidden-sm dropdown-toggle"
+                           data-toggle="dropdown" role="button" 
aria-haspopup="true" aria-expanded="false">Apache</a>
+                        <ul class="dropdown-menu">
+                            <li><a href="http://www.apache.org/"; 
target="_blank">Apache Homepage</a></li>
+                            <li><a href="http://www.apache.org/licenses/"; 
target="_blank">License</a></li>
+                            <li><a 
href="http://www.apache.org/foundation/sponsorship.html";
+                                   target="_blank">Sponsorship</a></li>
+                            <li><a 
href="http://www.apache.org/foundation/thanks.html"; 
target="_blank">Thanks</a></li>
+                        </ul>
+                    </li>
+
+                    <li>
+                        <a href="#" id="search-icon"><i class="fa fa-search" 
aria-hidden="true"></i></a>
+
+                    </li>
+
+                </ul>
+            </div><!--/.nav-collapse -->
+            <div id="search-box">
+                <form method="get" action="http://www.google.com/search"; 
target="_blank">
+                    <div class="search-block">
+                        <table border="0" cellpadding="0" width="100%">
+                            <tr>
+                                <td style="width:80%">
+                                    <input type="text" name="q" size=" 5" 
maxlength="255" value=""
+                                           class="search-input"  
placeholder="Search...."    required/>
+                                </td>
+                                <td style="width:20%">
+                                    <input type="submit" value="Search"/></td>
+                            </tr>
+                            <tr>
+                                <td align="left" style="font-size:75%" 
colspan="2">
+                                    <input type="checkbox" name="sitesearch" 
value="carbondata.apache.org" checked/>
+                                    <span style=" position: relative; top: 
-3px;"> Only search for CarbonData</span>
+                                </td>
+                            </tr>
+                        </table>
+                    </div>
+                </form>
+            </div>
+        </div>
+    </nav>
+</header> <!-- end Header part -->
+
+<div class="fixed-padding"></div> <!--  top padding with fixed header  -->
+
+<section><!-- Dashboard nav -->
+    <div class="container-fluid q">
+        <div class="col-sm-12  col-md-12 maindashboard">
+            <div class="row">
+                <section>
+                    <div style="padding:10px 15px;">
+                        <div id="viewpage" name="viewpage">
+                            <div class="row">
+                                <div class="col-sm-12  col-md-12">
+                                    <div>
+<h1>
+<a id="useful-tips" class="anchor" href="#useful-tips" 
aria-hidden="true"><span aria-hidden="true" class="octicon 
octicon-link"></span></a>Useful Tips</h1>
+<p>This tutorial guides you to create CarbonData Tables and optimize 
performance.
+The following sections will elaborate on the below topics :</p>
+<ul>
+<li><a href="#suggestions-to-create-carbondata-table">Suggestions to create 
CarbonData Table</a></li>
+<li><a 
href="#configuration-for-optimizing-data-loading-performance-for-massive-data">Configuration
 for Optimizing Data Loading performance for Massive Data</a></li>
+<li><a href="#configurations-for-optimizing-carbondata-performance">Configurations 
for Optimizing CarbonData Performance</a></li>
+</ul>
+<h2>
+<a id="suggestions-to-create-carbondata-table" class="anchor" 
href="#suggestions-to-create-carbondata-table" aria-hidden="true"><span 
aria-hidden="true" class="octicon octicon-link"></span></a>Suggestions to 
Create CarbonData Table</h2>
+<p>For example, the results of the analysis for table creation with dimensions 
ranging from 10 thousand to 10 billion rows and 100 to 300 columns have been 
summarized below.
+The following table describes some of the columns from the table used.</p>
+<ul>
+<li><strong>Table Column Description</strong></li>
+</ul>
+<table>
+<thead>
+<tr>
+<th>Column Name</th>
+<th>Data Type</th>
+<th>Cardinality</th>
+<th>Attribution</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>msisdn</td>
+<td>String</td>
+<td>30 million</td>
+<td>Dimension</td>
+</tr>
+<tr>
+<td>BEGIN_TIME</td>
+<td>BigInt</td>
+<td>10 Thousand</td>
+<td>Dimension</td>
+</tr>
+<tr>
+<td>HOST</td>
+<td>String</td>
+<td>1 million</td>
+<td>Dimension</td>
+</tr>
+<tr>
+<td>Dime_1</td>
+<td>String</td>
+<td>1 Thousand</td>
+<td>Dimension</td>
+</tr>
+<tr>
+<td>counter_1</td>
+<td>Decimal</td>
+<td>NA</td>
+<td>Measure</td>
+</tr>
+<tr>
+<td>counter_2</td>
+<td>Numeric(20,0)</td>
+<td>NA</td>
+<td>Measure</td>
+</tr>
+<tr>
+<td>...</td>
+<td>...</td>
+<td>NA</td>
+<td>Measure</td>
+</tr>
+<tr>
+<td>counter_100</td>
+<td>Decimal</td>
+<td>NA</td>
+<td>Measure</td>
+</tr>
+</tbody>
+</table>
+<ul>
+<li><strong>Put the frequently-used column filter in the 
beginning</strong></li>
+</ul>
+<p>For example, MSISDN filter is used in most of the query then we must put 
the MSISDN in the first column.
+The create table command can be modified as suggested below :</p>
+<pre><code>create table carbondata_table(
+  msisdn String,
+  BEGIN_TIME bigint,
+  HOST String,
+  Dime_1 String,
+  counter_1 Decimal,
+  ...
+  
+  )STORED BY 'carbondata'
+  TBLPROPERTIES ('SORT_COLUMNS'='msisdn, Dime_1')
+</code></pre>
+<p>Now the query with MSISDN in the filter will be more efficient.</p>
+<ul>
+<li><strong>Put the frequently-used columns in the order of low to high 
cardinality</strong></li>
+</ul>
+<p>If the table in the specified query has multiple columns which are 
frequently used to filter the results, it is suggested to put
+the columns in the order of cardinality low to high. This ordering of 
frequently used columns improves the compression ratio and
+enhances the performance of queries with filter on these columns.</p>
+<p>For example, if MSISDN, HOST and Dime_1 are frequently-used columns, then 
the column order of table is suggested as
+Dime_1&gt;HOST&gt;MSISDN, because Dime_1 has the lowest cardinality.
+The create table command can be modified as suggested below :</p>
+<pre><code>create table carbondata_table(
+    msisdn String,
+    BEGIN_TIME bigint,
+    HOST String,
+    Dime_1 String,
+    counter_1 Decimal,
+    ...
+    
+    )STORED BY 'carbondata'
+    TBLPROPERTIES ('SORT_COLUMNS'='Dime_1, HOST, MSISDN')
+</code></pre>
+<ul>
+<li><strong>For measure type columns with non high accuracy, replace 
Numeric(20,0) data type with Double data type</strong></li>
+</ul>
+<p>For columns of measure type, not requiring high accuracy, it is suggested 
to replace Numeric data type with Double to enhance query performance.
+The create table command can be modified as below :</p>
+<pre><code>  create table carbondata_table(
+    Dime_1 String,
+    BEGIN_TIME bigint,
+    END_TIME bigint,
+    HOST String,
+    MSISDN String,
+    counter_1 decimal,
+    counter_2 double,
+    ...
+    )STORED BY 'carbondata'
+    TBLPROPERTIES ('SORT_COLUMNS'='Dime_1, HOST, MSISDN')
+</code></pre>
+<p>The result of performance analysis of test-case shows reduction in query 
execution time from 15 to 3 seconds, thereby improving performance by nearly 5 
times.</p>
+<ul>
+<li><strong>Columns of incremental character should be re-arranged at the end 
of dimensions</strong></li>
+</ul>
+<p>Consider the following scenario where data is loaded each day and the 
begin_time is incremental for each load, it is suggested to put begin_time at 
the end of dimensions.
+Incremental values are efficient in using min/max index. The create table 
command can be modified as below :</p>
+<pre><code>create table carbondata_table(
+  Dime_1 String,
+  HOST String,
+  MSISDN String,
+  counter_1 double,
+  counter_2 double,
+  BEGIN_TIME bigint,
+  END_TIME bigint,
+  ...
+  counter_100 double
+  )STORED BY 'carbondata'
+  TBLPROPERTIES ('SORT_COLUMNS'='Dime_1, HOST, MSISDN')
+</code></pre>
+<p><strong>NOTE:</strong></p>
+<ul>
+<li>BloomFilter can be created to enhance performance for queries with precise 
equal/in conditions. You can find more information about it in BloomFilter 
datamap <a 
href="https://github.com/apache/carbondata/blob/master/docs/datamap/bloomfilter-datamap-guide.md"
 target="_blank">document</a>.</li>
+</ul>
+<h2>
+<a id="configuration-for-optimizing-data-loading-performance-for-massive-data" 
class="anchor" 
href="#configuration-for-optimizing-data-loading-performance-for-massive-data" 
aria-hidden="true"><span aria-hidden="true" class="octicon 
octicon-link"></span></a>Configuration for Optimizing Data Loading performance 
for Massive Data</h2>
+<p>CarbonData supports loading large volumes of data. During this process, sorting the data while 
loading consumes a lot of memory and disk IO, and
+this can sometimes result in an "Out Of Memory" exception.
+If you do not have much memory to use, then you may prefer to slow the speed 
of data loading instead of data load failure.
+You can configure CarbonData by tuning following properties in 
carbon.properties file to get a better performance.</p>
+<table>
+<thead>
+<tr>
+<th>Parameter</th>
+<th>Default Value</th>
+<th>Description/Tuning</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>carbon.number.of.cores.while.loading</td>
+<td>Default: 2. This value should be &gt;= 2</td>
+<td>Specifies the number of cores used for data processing during data loading 
in CarbonData.</td>
+</tr>
+<tr>
+<td>carbon.sort.size</td>
+<td>Default: 100000. The value should be &gt;= 100.</td>
+<td>Threshold to write local file in sort step when loading data</td>
+</tr>
+<tr>
+<td>carbon.sort.file.write.buffer.size</td>
+<td>Default:  50000.</td>
+<td>DataOutputStream buffer.</td>
+</tr>
+<tr>
+<td>carbon.number.of.cores.block.sort</td>
+<td>Default: 7</td>
+<td>If you have huge memory and CPUs, increase it as you will</td>
+</tr>
+<tr>
+<td>carbon.merge.sort.reader.thread</td>
+<td>Default: 3</td>
+<td>Specifies the number of cores used for temp file merging during data 
loading in CarbonData.</td>
+</tr>
+<tr>
+<td>carbon.merge.sort.prefetch</td>
+<td>Default: true</td>
+<td>You may want to set this value to false if you do not have enough memory</td>
+</tr>
+</tbody>
+</table>
+<p>For example, suppose 10 million records are to be loaded into a CarbonData table 
and only 16 cores and 64GB of memory are available.
+With the default configuration, the load always fails in the sort step. Modify 
carbon.properties as suggested below:</p>
+<pre><code>carbon.number.of.cores.block.sort=1
+carbon.merge.sort.reader.thread=1
+carbon.sort.size=5000
+carbon.sort.file.write.buffer.size=5000
+carbon.merge.sort.prefetch=false
+</code></pre>
+<h2>
+<a id="configurations-for-optimizing-carbondata-performance" class="anchor" 
href="#configurations-for-optimizing-carbondata-performance" 
aria-hidden="true"><span aria-hidden="true" class="octicon 
octicon-link"></span></a>Configurations for Optimizing CarbonData 
Performance</h2>
+<p>Recently we did some performance POC on CarbonData for Finance and 
telecommunication Field. It involved detailed queries and aggregation
+scenarios. After the completion of POC, some of the configurations impacting 
the performance have been identified and tabulated below :</p>
+<table>
+<thead>
+<tr>
+<th>Parameter</th>
+<th>Location</th>
+<th>Used For</th>
+<th>Description</th>
+<th>Tuning</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>carbon.sort.intermediate.files.limit</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Data loading</td>
+<td>During the loading of data, local temp is used to sort the data. This 
number specifies the minimum number of intermediate files after which the  
merge sort has to be initiated.</td>
+<td>Increasing the parameter to a higher value will improve the load 
performance. For example, when we increase the value from 20 to 100, it 
increases the data load performance from 35MB/S to more than 50MB/S. Higher 
values of this parameter consume more memory during the load.</td>
+</tr>
+<tr>
+<td>carbon.number.of.cores.while.loading</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Data loading</td>
+<td>Specifies the number of cores used for data processing during data loading 
in CarbonData.</td>
+<td>If you have more number of CPUs, then you can increase the number of CPUs, 
which will increase the performance. For example if we increase the value from 
2 to 4 then the CSV reading performance can increase about 1 times</td>
+</tr>
+<tr>
+<td>carbon.compaction.level.threshold</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Data loading and Querying</td>
+<td>For minor compaction, specifies the number of segments to be merged in 
stage 1 and number of compacted segments to be merged in stage 2.</td>
+<td>Each CarbonData load will create one segment, if every load is small in 
size it will generate many small files over a period of time impacting the query 
performance. Configuring this parameter will merge the small segment to one big 
segment which will sort the data and improve the performance. For Example in 
one telecommunication scenario, the performance improves about 2 times after 
minor compaction.</td>
+</tr>
+<tr>
+<td>spark.sql.shuffle.partitions</td>
+<td>spark/conf/spark-defaults.conf</td>
+<td>Querying</td>
+<td>The number of tasks started when Spark shuffles data.</td>
+<td>The value can be 1 to 2 times as much as the executor cores. In an 
aggregation scenario, reducing the number from 200 to 32 reduced the query time 
from 17 to 9 seconds.</td>
+</tr>
+<tr>
+<td>spark.executor.instances/spark.executor.cores/spark.executor.memory</td>
+<td>spark/conf/spark-defaults.conf</td>
+<td>Querying</td>
+<td>The number of executors, CPU cores, and memory used for CarbonData 
query.</td>
+<td>In the bank scenario, we provided 4 CPU cores and 15 GB of memory for each 
executor, which gave good performance. For these two values, more is not always 
better; they need to be configured properly when resources are limited. For 
example, in the bank scenario each node had plenty of CPU (32 cores) but 
comparatively little memory (64 GB), so we could not give each executor more CPU 
cores with less memory. For example, with 4 cores and 12 GB for each executor, GC 
sometimes occurred during the query, which severely impacted query performance, 
increasing the query time from 3 seconds to more than 15 seconds. In this 
scenario, either increase the memory or decrease the number of CPU cores.</td>
+</tr>
+<tr>
+<td>carbon.detail.batch.size</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Querying</td>
+<td>The buffer size to store records, returned from the block scan.</td>
+<td>In limit scenario this parameter is very important. For example your query 
limit is 1000. But if we set this value to 3000 that means we get 3000 records 
from scan but spark will only take 1000 rows. So the 2000 remaining are 
useless. In one Finance test case after we set it to 100, in the limit 1000 
scenario the performance increase about 2 times in comparison to if we set this 
value to 12000.</td>
+</tr>
+<tr>
+<td>carbon.use.local.dir</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Data loading</td>
+<td>Whether to use YARN local directories for multi-table load disk load 
balance</td>
+<td>If this is set to true, CarbonData will use YARN local directories for 
multi-table load disk load balance, which will improve the data load 
performance.</td>
+</tr>
+<tr>
+<td>carbon.use.multiple.temp.dir</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Data loading</td>
+<td>Whether to use multiple YARN local directories during table data loading 
for disk load balance</td>
+<td>After enabling 'carbon.use.local.dir', if this is set to true, CarbonData 
will use all YARN local directories during data load for disk load balance, 
that will improve the data load performance. Please enable this property when 
you encounter disk hotspot problem during data loading.</td>
+</tr>
+<tr>
+<td>carbon.sort.temp.compressor</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Data loading</td>
+<td>Specify the name of compressor to compress the intermediate sort temporary 
files during sort procedure in data loading.</td>
+<td>The optional values are 'SNAPPY','GZIP','BZIP2','LZ4','ZSTD' and empty. By 
default, empty means that Carbondata will not compress the sort temp files. 
This parameter will be useful if you encounter disk bottleneck.</td>
+</tr>
+<tr>
+<td>carbon.load.skewedDataOptimization.enabled</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Data loading</td>
+<td>Whether to enable size based block allocation strategy for data 
loading.</td>
+<td>When loading, carbondata will use file size based block allocation 
strategy for task distribution. It will make sure that all the executors 
process the same size of data -- It's useful if the size of your input data 
files varies widely, say 1MB~1GB.</td>
+</tr>
+<tr>
+<td>carbon.load.min.size.enabled</td>
+<td>spark/carbonlib/carbon.properties</td>
+<td>Data loading</td>
+<td>Whether to enable node minimum input data size allocation strategy for 
data loading.</td>
+<td>When loading, carbondata will use node minimum input data size allocation 
strategy for task distribution. It will make sure the node loads the minimum 
amount of data -- It's useful if the size of your input data files is very small, 
say 1MB~256MB, as it avoids generating a large number of small files.</td>
+</tr>
+</tbody>
+</table>
+<p>Note: If your CarbonData instance is provided only for query, you may 
specify the property 'spark.speculation=true' which is in conf directory of 
spark.</p>
+</div>
+</div>
+</div>
+</div>
+<div class="doc-footer">
+    <a href="#top" class="scroll-top">Top</a>
+</div>
+</div>
+</section>
+</div>
+</div>
+</div>
+</section><!-- End systemblock part -->
+<script src="js/custom.js"></script>
+</body>
+</html>
\ No newline at end of file
[17/20] carbondata-site git commit: Updated changes for 1.5.0 release

Reply via email to