http://git-wip-us.apache.org/repos/asf/carbondata-site/blob/a51dc596/content/dml-operation-on-carbondata.html ---------------------------------------------------------------------- diff --git a/content/dml-operation-on-carbondata.html b/content/dml-operation-on-carbondata.html deleted file mode 100644 index 655a61c..0000000 --- a/content/dml-operation-on-carbondata.html +++ /dev/null @@ -1,716 +0,0 @@ -<!DOCTYPE html> -<html lang="en"> -<head> - <meta charset="utf-8"> - <meta http-equiv="X-UA-Compatible" content="IE=edge"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <link href='images/favicon.ico' rel='shortcut icon' type='image/x-icon'> - <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags --> - <title>CarbonData</title> - <style> - - </style> - <!-- Bootstrap --> - - <link rel="stylesheet" href="css/bootstrap.min.css"> - <link href="css/style.css" rel="stylesheet"> - <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries --> - <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> - <!--[if lt IE 9]> - <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script> - <script src="https://oss.maxcdn.scom/respond/1.4.2/respond.min.js"></script> - <![endif]--> - <script src="js/jquery.min.js"></script> - <script src="js/bootstrap.min.js"></script> - - -</head> -<body> -<header> - <nav class="navbar navbar-default navbar-custom cd-navbar-wrapper"> - <div class="container"> - <div class="navbar-header"> - <button aria-controls="navbar" aria-expanded="false" data-target="#navbar" data-toggle="collapse" - class="navbar-toggle collapsed" type="button"> - <span class="sr-only">Toggle navigation</span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - </button> - <a href="index.html" class="logo"> - <img src="images/CarbonDataLogo.png" alt="CarbonData logo" 
title="CarbonData logo"/> - </a> - </div> - <div class="navbar-collapse collapse cd_navcontnt" id="navbar"> - <ul class="nav navbar-nav navbar-right navlist-custom"> - <li><a href="index.html" class="hidden-xs"><i class="fa fa-home" aria-hidden="true"></i> </a> - </li> - <li><a href="index.html" class="hidden-lg hidden-md hidden-sm">Home</a></li> - <li class="dropdown"> - <a href="#" class="dropdown-toggle " data-toggle="dropdown" role="button" aria-haspopup="true" - aria-expanded="false"> Download <span class="caret"></span></a> - <ul class="dropdown-menu"> - <li> - <a href="https://dist.apache.org/repos/dist/release/carbondata/1.2.0/" - target="_blank">Apache CarbonData 1.2.0</a></li> - <li> - <a href="https://dist.apache.org/repos/dist/release/carbondata/1.1.1/" - target="_blank">Apache CarbonData 1.1.1</a></li> - <li> - <a href="https://dist.apache.org/repos/dist/release/carbondata/1.1.0/" - target="_blank">Apache CarbonData 1.1.0</a></li> - <li> - <a href="http://archive.apache.org/dist/incubator/carbondata/1.0.0-incubating/" - target="_blank">Apache CarbonData 1.0.0</a></li> - <li> - <a href="http://archive.apache.org/dist/incubator/carbondata/0.2.0-incubating/" - target="_blank">Apache CarbonData 0.2.0</a></li> - <li> - <a href="http://archive.apache.org/dist/incubator/carbondata/0.1.1-incubating/" - target="_blank">Apache CarbonData 0.1.1</a></li> - <li> - <a href="http://archive.apache.org/dist/incubator/carbondata/0.1.0-incubating/" - target="_blank">Apache CarbonData 0.1.0</a></li> - <li> - <a href="https://cwiki.apache.org/confluence/display/CARBONDATA/Releases" - target="_blank">Release Archive</a></li> - </ul> - </li> - <li><a href="mainpage.html" class="active">Documentation</a></li> - <li class="dropdown"> - <a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" - aria-expanded="false">Community <span class="caret"></span></a> - <ul class="dropdown-menu"> - <li> - <a 
href="https://github.com/apache/carbondata/blob/master/docs/How-to-contribute-to-Apache-CarbonData.md" - target="_blank">Contributing to CarbonData</a></li> - <li> - <a href="https://github.com/apache/carbondata/blob/master/docs/release-guide.md" - target="_blank">Release Guide</a></li> - <li> - <a href="https://cwiki.apache.org/confluence/display/CARBONDATA/PMC+and+Committers+member+list" - target="_blank">Project PMC and Committers</a></li> - <li> - <a href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=66850609" - target="_blank">CarbonData Meetups</a></li> - <li><a href="security.html">Apache CarbonData Security</a></li> - <li><a href="https://issues.apache.org/jira/browse/CARBONDATA" target="_blank">Apache - Jira</a></li> - <li><a href="videogallery.html">CarbonData Videos </a></li> - </ul> - </li> - <li class="dropdown"> - <a href="http://www.apache.org/" class="apache_link hidden-xs dropdown-toggle" - data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Apache</a> - <ul class="dropdown-menu"> - <li><a href="http://www.apache.org/" target="_blank">Apache Homepage</a></li> - <li><a href="http://www.apache.org/licenses/" target="_blank">License</a></li> - <li><a href="http://www.apache.org/foundation/sponsorship.html" - target="_blank">Sponsorship</a></li> - <li><a href="http://www.apache.org/foundation/thanks.html" target="_blank">Thanks</a></li> - </ul> - </li> - - <li class="dropdown"> - <a href="http://www.apache.org/" class="hidden-lg hidden-md hidden-sm dropdown-toggle" - data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Apache</a> - <ul class="dropdown-menu"> - <li><a href="http://www.apache.org/" target="_blank">Apache Homepage</a></li> - <li><a href="http://www.apache.org/licenses/" target="_blank">License</a></li> - <li><a href="http://www.apache.org/foundation/sponsorship.html" - target="_blank">Sponsorship</a></li> - <li><a href="http://www.apache.org/foundation/thanks.html" 
target="_blank">Thanks</a></li> - </ul> - </li> - - <li> - <a href="#" id="search-icon"><i class="fa fa-search" aria-hidden="true"></i></a> - - </li> - - </ul> - </div><!--/.nav-collapse --> - <div id="search-box"> - <form method="get" action="http://www.google.com/search" target="_blank"> - <div class="search-block"> - <table border="0" cellpadding="0" width="100%"> - <tr> - <td style="width:80%"> - <input type="text" name="q" size=" 5" maxlength="255" value="" - class="search-input" placeholder="Search...." required/> - </td> - <td style="width:20%"> - <input type="submit" value="Search"/></td> - </tr> - <tr> - <td align="left" style="font-size:75%" colspan="2"> - <input type="checkbox" name="sitesearch" value="carbondata.apache.org" checked/> - <span style=" position: relative; top: -3px;"> Only search for CarbonData</span> - </td> - </tr> - </table> - </div> - </form> - </div> - </div> - </nav> -</header> <!-- end Header part --> - -<div class="fixed-padding"></div> <!-- top padding with fixde header --> - -<section><!-- Dashboard nav --> - <div class="container-fluid q"> - <div class="col-sm-12 col-md-12 maindashboard"> - <div class="row"> - <section> - <div style="padding:10px 15px;"> - <div id="viewpage" name="viewpage"> - <div class="row"> - <div class="col-sm-12 col-md-12"> - <div> -<h1> -<a id="dml-operations-on-carbondata" class="anchor" href="#dml-operations-on-carbondata" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>DML Operations on CarbonData</h1> -<p>This tutorial guides you through the data manipulation language support provided by CarbonData.</p> -<h2> -<a id="overview" class="anchor" href="#overview" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Overview</h2> -<p>The following DML operations are supported in CarbonData :</p> -<ul> -<li><a href="#load-data">LOAD DATA</a></li> -<li><a href="#insert-data-into-a-carbondata-table">INSERT DATA INTO A CARBONDATA TABLE</a></li> 
-<li><a href="#show-segments">SHOW SEGMENTS</a></li> -<li><a href="#delete-segment-by-id">DELETE SEGMENT BY ID</a></li> -<li><a href="#delete-segment-by-date">DELETE SEGMENT BY DATE</a></li> -<li><a href="#update-carbondata-table">UPDATE CARBONDATA TABLE</a></li> -<li><a href="#delete-records-from-carbondata-table">DELETE RECORDS FROM CARBONDATA TABLE</a></li> -</ul> -<h2> -<a id="load-data" class="anchor" href="#load-data" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>LOAD DATA</h2> -<p>This command loads the user data in raw format to the CarbonData specific data format store, this allows CarbonData to provide good performance while querying the data. -Please visit <a href="data-management.html">Data Management</a> for more details on LOAD.</p> -<h3> -<a id="syntax" class="anchor" href="#syntax" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Syntax</h3> -<pre><code>LOAD DATA [LOCAL] INPATH 'folder_path' -INTO TABLE [db_name.]table_name -OPTIONS(property_name=property_value, ...) -</code></pre> -<p>OPTIONS are not mandatory for data loading process. 
Inside OPTIONS user can provide either of any options like DELIMITER, QUOTECHAR, ESCAPECHAR, MULTILINE as per requirement.</p> -<p>NOTE: The path shall be canonical path.</p> -<h3> -<a id="parameter-description" class="anchor" href="#parameter-description" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Parameter Description</h3> -<table> -<thead> -<tr> -<th>Parameter</th> -<th>Description</th> -<th>Optional</th> -</tr> -</thead> -<tbody> -<tr> -<td>folder_path</td> -<td>Path of raw csv data folder or file.</td> -<td>NO</td> -</tr> -<tr> -<td>db_name</td> -<td>Database name, if it is not specified then it uses the current database.</td> -<td>YES</td> -</tr> -<tr> -<td>table_name</td> -<td>The name of the table in provided database.</td> -<td>NO</td> -</tr> -<tr> -<td>OPTIONS</td> -<td>Extra options provided to Load</td> -<td>YES</td> -</tr> -</tbody> -</table> -<h3> -<a id="usage-guidelines" class="anchor" href="#usage-guidelines" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Usage Guidelines</h3> -<p>You can use the following options to load data:</p> -<ul> -<li> -<p><strong>DELIMITER:</strong> Delimiters can be provided in the load command.</p> -<pre><code>OPTIONS('DELIMITER'=',') -</code></pre> -</li> -<li> -<p><strong>QUOTECHAR:</strong> Quote Characters can be provided in the load command.</p> -<pre><code>OPTIONS('QUOTECHAR'='"') -</code></pre> -</li> -<li> -<p><strong>COMMENTCHAR:</strong> Comment Characters can be provided in the load command if user want to comment lines.</p> -<pre><code>OPTIONS('COMMENTCHAR'='#') -</code></pre> -</li> -<li> -<p><strong>FILEHEADER:</strong> Headers can be provided in the LOAD DATA command if headers are missing in the source files.</p> -<pre><code>OPTIONS('FILEHEADER'='column1,column2') -</code></pre> -</li> -<li> -<p><strong>MULTILINE:</strong> CSV with new line character in quotes.</p> -<pre><code>OPTIONS('MULTILINE'='true') -</code></pre> -</li> 
-<li> -<p><strong>ESCAPECHAR:</strong> Escape char can be provided if user want strict validation of escape character on CSV.</p> -<pre><code>OPTIONS('ESCAPECHAR'='\') -</code></pre> -</li> -<li> -<p><strong>COMPLEX_DELIMITER_LEVEL_1:</strong> Split the complex type data column in a row (eg., a$b$c --> Array = {a,b,c}).</p> -<pre><code>OPTIONS('COMPLEX_DELIMITER_LEVEL_1'='$') -</code></pre> -</li> -<li> -<p><strong>COMPLEX_DELIMITER_LEVEL_2:</strong> Split the complex type nested data column in a row. Applies level_1 delimiter & applies level_2 based on complex data type (eg., a:b$c:d --> Array> = {{a,b},{c,d}}).</p> -<pre><code>OPTIONS('COMPLEX_DELIMITER_LEVEL_2'=':') -</code></pre> -</li> -<li> -<p><strong>ALL_DICTIONARY_PATH:</strong> All dictionary files path.</p> -<pre><code>OPTIONS('ALL_DICTIONARY_PATH'='/opt/alldictionary/data.dictionary') -</code></pre> -</li> -<li> -<p><strong>COLUMNDICT:</strong> Dictionary file path for specified column.</p> -<pre><code>OPTIONS('COLUMNDICT'='column1:dictionaryFilePath1, -column2:dictionaryFilePath2') -</code></pre> -<p>NOTE: ALL_DICTIONARY_PATH and COLUMNDICT can't be used together.</p> -</li> -<li> -<p><strong>DATEFORMAT:</strong> Date format for specified column.</p> -<pre><code>OPTIONS('DATEFORMAT'='column1:dateFormat1, column2:dateFormat2') -</code></pre> -<p>NOTE: Date formats are specified by date pattern strings. The date pattern letters in CarbonData are same as in JAVA. Refer to <a href="http://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html" target=_blank>SimpleDateFormat</a>.</p> -</li> -<li> -<p><strong>SINGLE_PASS:</strong> Single Pass Loading enables single job to finish data loading with dictionary generation on the fly. It enhances performance in the scenarios where the subsequent data loading after initial load involves fewer incremental updates on the dictionary.</p> -<p>This option specifies whether to use single pass for loading data or not. 
By default this option is set to FALSE.</p> -<pre><code>OPTIONS('SINGLE_PASS'='TRUE') -</code></pre> -<p>Note :</p> -<ul> -<li> -<p>If this option is set to TRUE then data loading will take less time.</p> -</li> -<li> -<p>If this option is set to some invalid value other than TRUE or FALSE then it uses the default value.</p> -</li> -<li> -<p>If this option is set to TRUE, then high.cardinality.identify.enable property will be disabled during data load.</p> -</li> -</ul> -<h3> -<a id="example" class="anchor" href="#example" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Example:</h3> -</li> -</ul> -<pre><code>LOAD DATA local inpath '/opt/rawdata/data.csv' INTO table carbontable -options('DELIMITER'=',', 'QUOTECHAR'='"','COMMENTCHAR'='#', -'FILEHEADER'='empno,empname,designation,doj,workgroupcategory, - workgroupcategoryname,deptno,deptname,projectcode, - projectjoindate,projectenddate,attendance,utilization,salary', -'MULTILINE'='true','ESCAPECHAR'='\','COMPLEX_DELIMITER_LEVEL_1'='$', -'COMPLEX_DELIMITER_LEVEL_2'=':', -'ALL_DICTIONARY_PATH'='/opt/alldictionary/data.dictionary', -'SINGLE_PASS'='TRUE' -) -</code></pre> -<ul> -<li> -<p><strong>BAD RECORDS HANDLING:</strong> Methods of handling bad records are as follows:</p> -<ul> -<li> -<p>Load all of the data before dealing with the errors.</p> -</li> -<li> -<p>Clean or delete bad records before loading data or stop the loading when bad records are found.</p> -</li> -</ul> -<pre><code>OPTIONS('BAD_RECORDS_LOGGER_ENABLE'='true', 'BAD_RECORD_PATH'='hdfs://hacluster/tmp/carbon', 'BAD_RECORDS_ACTION'='REDIRECT', 'IS_EMPTY_DATA_BAD_RECORD'='false') -</code></pre> -<p>NOTE:</p> -<ul> -<li> -<p>If the REDIRECT option is used, Carbon will add all bad records in to a separate CSV file. However, this file must not be used for subsequent data loading because the content may not exactly match the source record. You are advised to cleanse the original source record for further data ingestion. 
This option is used to remind you which records are bad records.</p> -</li> -<li> -<p>In loaded data, if all records are bad records, the BAD_RECORDS_ACTION is invalid and the load operation fails.</p> -</li> -<li> -<p>The maximum number of characters per column is 100000. If there are more than 100000 characters in a column, data loading will fail.</p> -</li> -</ul> -</li> -</ul> -<h3> -<a id="example-1" class="anchor" href="#example-1" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Example:</h3> -<pre><code>LOAD DATA INPATH 'filepath.csv' -INTO TABLE tablename -OPTIONS('BAD_RECORDS_LOGGER_ENABLE'='true', -'BAD_RECORD_PATH'='hdfs://hacluster/tmp/carbon', -'BAD_RECORDS_ACTION'='REDIRECT', -'IS_EMPTY_DATA_BAD_RECORD'='false'); -</code></pre> -<p><strong>Bad Records Management Options:</strong></p> -<table> -<thead> -<tr> -<th>Options</th> -<th>Default Value</th> -<th>Description</th> -</tr> -</thead> -<tbody> -<tr> -<td>BAD_RECORDS_LOGGER_ENABLE</td> -<td>false</td> -<td>Whether to create logs with details about bad records.</td> -</tr> -<tr> -<td>BAD_RECORDS_ACTION</td> -<td>FAIL</td> -<td>Following are the four types of action for bad records: FORCE: Auto-corrects the data by storing the bad records as NULL. REDIRECT: Bad records are written to the raw CSV instead of being loaded. IGNORE: Bad records are neither loaded nor written to the raw CSV. FAIL: Data loading fails if any bad records are found. NOTE: In loaded data, if all records are bad records, the BAD_RECORDS_ACTION is invalid and the load operation fails.</td> -</tr> -<tr> -<td>IS_EMPTY_DATA_BAD_RECORD</td> -<td>false</td> -<td>If false, then empty ("" or '' or ,,) data will not be considered as bad record and vice versa.</td> -</tr> -<tr> -<td>BAD_RECORD_PATH</td> -<td>-</td> -<td>Specifies the HDFS path where bad records are stored. By default the value is Null. 
This path must be configured by the user if the bad record logger is enabled or the bad record action is REDIRECT.</td> -</tr> -</tbody> -</table> -<h2> -<a id="insert-data-into-a-carbondata-table" class="anchor" href="#insert-data-into-a-carbondata-table" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>INSERT DATA INTO A CARBONDATA TABLE</h2> -<p>This command inserts data into a CarbonData table. It is defined as a combination of two queries: an Insert query and a Select query. It inserts records from a source table into a target CarbonData table. The source table can be a Hive table, Parquet table or a CarbonData table itself. It comes with the functionality to aggregate the records of a table by performing Select query on source table and load its corresponding resultant records into a CarbonData table.</p> -<p><strong>NOTE</strong> : The client node where the INSERT command is executing, must be part of the cluster.</p> -<h3> -<a id="syntax-1" class="anchor" href="#syntax-1" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Syntax</h3> -<pre><code>INSERT INTO TABLE <CARBONDATA TABLE> SELECT * FROM sourceTableName -[ WHERE { <filter_condition> } ]; -</code></pre> -<p>You can also omit the <code>table</code> keyword and write your query as:</p> -<pre><code>INSERT INTO <CARBONDATA TABLE> SELECT * FROM sourceTableName -[ WHERE { <filter_condition> } ]; -</code></pre> -<h3> -<a id="parameter-description-1" class="anchor" href="#parameter-description-1" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Parameter Description</h3> -<table> -<thead> -<tr> -<th>Parameter</th> -<th>Description</th> -</tr> -</thead> -<tbody> -<tr> -<td>CARBONDATA TABLE</td> -<td>The name of the Carbon table in which you want to perform the insert operation.</td> -</tr> -<tr> -<td>sourceTableName</td> -<td>The table from which the records are read and inserted into destination CarbonData table.</td> 
-</tr> -</tbody> -</table> -<h3> -<a id="usage-guidelines-1" class="anchor" href="#usage-guidelines-1" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Usage Guidelines</h3> -<p>The following condition must be met for successful insert operation :</p> -<ul> -<li>The source table and the CarbonData table must have the same table schema.</li> -<li>The table must be created.</li> -<li>Overwrite is not supported for CarbonData table.</li> -<li>The data type of source and destination table columns should be same, else the data from source table will be treated as bad records and the INSERT command fails.</li> -<li>INSERT INTO command does not support partial success if bad records are found, it will fail.</li> -<li>Data cannot be loaded or updated in source table while insert from source table to target table is in progress.</li> -</ul> -<p>To enable data load or update during insert operation, configure the following property to true.</p> -<pre><code>carbon.insert.persist.enable=true -</code></pre> -<p>By default the above configuration will be false.</p> -<p><strong>NOTE</strong>: Enabling this property will reduce the performance.</p> -<h3> -<a id="examples" class="anchor" href="#examples" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Examples</h3> -<pre><code>INSERT INTO table1 SELECT item1 ,sum(item2 + 1000) as result FROM -table2 group by item1; -</code></pre> -<pre><code>INSERT INTO table1 SELECT item1, item2, item3 FROM table2 -where item2='xyz'; -</code></pre> -<pre><code>INSERT INTO table1 SELECT * FROM table2 -where exists (select * from table3 -where table2.item1 = table3.item1); -</code></pre> -<p><strong>The Status Success/Failure shall be captured in the driver log.</strong></p> -<h2> -<a id="show-segments" class="anchor" href="#show-segments" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>SHOW SEGMENTS</h2> -<p>This command is used to get the 
segments of CarbonData table.</p> -<pre><code>SHOW SEGMENTS FOR TABLE [db_name.]table_name -LIMIT number_of_segments; -</code></pre> -<h3> -<a id="parameter-description-2" class="anchor" href="#parameter-description-2" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Parameter Description</h3> -<table> -<thead> -<tr> -<th>Parameter</th> -<th>Description</th> -<th>Optional</th> -</tr> -</thead> -<tbody> -<tr> -<td>db_name</td> -<td>Database name, if it is not specified then it uses the current database.</td> -<td>YES</td> -</tr> -<tr> -<td>table_name</td> -<td>The name of the table in provided database.</td> -<td>NO</td> -</tr> -<tr> -<td>number_of_segments</td> -<td>Limit the output to this number.</td> -<td>YES</td> -</tr> -</tbody> -</table> -<h3> -<a id="example-2" class="anchor" href="#example-2" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Example:</h3> -<pre><code>SHOW SEGMENTS FOR TABLE CarbonDatabase.CarbonTable LIMIT 4; -</code></pre> -<h2> -<a id="delete-segment-by-id" class="anchor" href="#delete-segment-by-id" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>DELETE SEGMENT BY ID</h2> -<p>This command is used to delete segment by using the segment ID. Each segment has a unique segment ID associated with it. -Using this segment ID, you can remove the segment.</p> -<p>The following command will get the segmentID.</p> -<pre><code>SHOW SEGMENTS FOR Table [db_name.]table_name LIMIT number_of_segments -</code></pre> -<p>After you retrieve the segment ID of the segment that you want to delete, execute the following command to delete the selected segment.</p> -<pre><code>DELETE FROM TABLE [db_name.]table_name WHERE SEGMENT.ID IN (segment_id1, segments_id2, ...) 
-</code></pre> -<h3> -<a id="parameter-description-3" class="anchor" href="#parameter-description-3" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Parameter Description</h3> -<table> -<thead> -<tr> -<th>Parameter</th> -<th>Description</th> -<th>Optional</th> -</tr> -</thead> -<tbody> -<tr> -<td>segment_id</td> -<td>Segment Id of the load.</td> -<td>NO</td> -</tr> -<tr> -<td>db_name</td> -<td>Database name, if it is not specified then it uses the current database.</td> -<td>YES</td> -</tr> -<tr> -<td>table_name</td> -<td>The name of the table in provided database.</td> -<td>NO</td> -</tr> -</tbody> -</table> -<h3> -<a id="example-3" class="anchor" href="#example-3" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Example:</h3> -<pre><code>DELETE FROM TABLE CarbonDatabase.CarbonTable WHERE SEGMENT.ID IN (0); -DELETE FROM TABLE CarbonDatabase.CarbonTable WHERE SEGMENT.ID IN (0,5,8); -</code></pre> -<p>NOTE: Here 0.1 is compacted segment sequence id.</p> -<h2> -<a id="delete-segment-by-date" class="anchor" href="#delete-segment-by-date" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>DELETE SEGMENT BY DATE</h2> -<p>This command will allow to delete the CarbonData segment(s) from the store based on the date provided by the user in the DML command. -The segment created before the particular date will be removed from the specific stores.</p> -<pre><code>DELETE FROM TABLE [db_name.]table_name -WHERE SEGMENT.STARTTIME BEFORE DATE_VALUE -</code></pre> -<h3> -<a id="parameter-description-4" class="anchor" href="#parameter-description-4" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Parameter Description</h3> -<table> -<thead> -<tr> -<th>Parameter</th> -<th>Description</th> -<th>Optional</th> -</tr> -</thead> -<tbody> -<tr> -<td>DATE_VALUE</td> -<td>Valid segment load start time value. 
All the segments before this specified date will be deleted.</td> -<td>NO</td> -</tr> -<tr> -<td>db_name</td> -<td>Database name, if it is not specified then it uses the current database.</td> -<td>YES</td> -</tr> -<tr> -<td>table_name</td> -<td>The name of the table in provided database.</td> -<td>NO</td> -</tr> -</tbody> -</table> -<h3> -<a id="example-4" class="anchor" href="#example-4" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Example:</h3> -<pre><code> DELETE FROM TABLE CarbonDatabase.CarbonTable - WHERE SEGMENT.STARTTIME BEFORE '2017-06-01 12:05:06'; -</code></pre> -<h2> -<a id="update-carbondata-table" class="anchor" href="#update-carbondata-table" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Update CarbonData Table</h2> -<p>This command will allow to update the carbon table based on the column expression and optional filter conditions.</p> -<h3> -<a id="syntax-2" class="anchor" href="#syntax-2" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Syntax</h3> -<pre><code> UPDATE <table_name> - SET (column_name1, column_name2, ... column_name n) = - (column1_expression , column2_expression, ... 
column n_expression ) - [ WHERE { <filter_condition> } ]; -</code></pre> -<p>Alternatively, the following command can also be used for updating the CarbonData table:</p> -<pre><code>UPDATE <table_name> -SET (column_name1, column_name2) = -(select sourceColumn1, sourceColumn2 from sourceTable -[ WHERE { <filter_condition> } ] ) -[ WHERE { <filter_condition> } ]; -</code></pre> -<h3> -<a id="parameter-description-5" class="anchor" href="#parameter-description-5" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Parameter Description</h3> -<table> -<thead> -<tr> -<th>Parameter</th> -<th>Description</th> -</tr> -</thead> -<tbody> -<tr> -<td>table_name</td> -<td>The name of the Carbon table in which you want to perform the update operation.</td> -</tr> -<tr> -<td>column_name</td> -<td>The destination columns to be updated.</td> -</tr> -<tr> -<td>sourceColumn</td> -<td>The source table column values to be updated in destination table.</td> -</tr> -<tr> -<td>sourceTable</td> -<td>The table from which the records are updated into destination Carbon table.</td> -</tr> -</tbody> -</table> -<p>NOTE: This functionality is currently not supported in Spark 2.x and will be supported soon.</p> -<h3> -<a id="usage-guidelines-2" class="anchor" href="#usage-guidelines-2" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Usage Guidelines</h3> -<p>The following conditions must be met for a successful update:</p> -<ul> -<li>The update command fails if multiple input rows in source table are matched with single row in destination table.</li> -<li>If the source table generates empty records, the update operation will complete successfully without updating the table.</li> -<li>If a source table row does not correspond to any of the existing rows in a destination table, the update operation will complete successfully without updating the table.</li> -<li>In sub-query, if the source table and the target table are same, then 
the update operation fails.</li> -<li>If the sub-query used in UPDATE statement contains aggregate method or group by query, then the UPDATE operation fails.</li> -</ul> -<h3> -<a id="examples-1" class="anchor" href="#examples-1" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Examples</h3> -<p>Update is not supported for queries that contain aggregate or group by.</p> -<pre><code> UPDATE t_carbn01 a - SET (a.item_type_code, a.profit) = ( SELECT b.item_type_cd, - sum(b.profit) from t_carbn01b b - WHERE item_type_cd =2 group by item_type_code); -</code></pre> -<p>Here the Update Operation fails as the query contains aggregate function sum(b.profit) and group by clause in the sub-query.</p> -<pre><code>UPDATE carbonTable1 d -SET(d.column3,d.column5 ) = (SELECT s.c33 ,s.c55 -FROM sourceTable1 s WHERE d.column1 = s.c11) -WHERE d.column1 = 'china' EXISTS( SELECT * from table3 o where o.c2 > 1); -</code></pre> -<pre><code>UPDATE carbonTable1 d SET (c3) = (SELECT s.c33 from sourceTable1 s -WHERE d.column1 = s.c11) -WHERE exists( select * from iud.other o where o.c2 > 1); -</code></pre> -<pre><code>UPDATE carbonTable1 SET (c2, c5 ) = (c2 + 1, concat(c5 , "y" )); -</code></pre> -<pre><code>UPDATE carbonTable1 d SET (c2, c5 ) = (c2 + 1, "xyx") -WHERE d.column1 = 'india'; -</code></pre> -<pre><code>UPDATE carbonTable1 d SET (c2, c5 ) = (c2 + 1, "xyx") -WHERE d.column1 = 'india' -and EXISTS( SELECT * FROM table3 o WHERE o.column2 > 1); -</code></pre> -<p><strong>The Status Success/Failure shall be captured in the driver log and the client.</strong></p> -<h2> -<a id="delete-records-from-carbondata-table" class="anchor" href="#delete-records-from-carbondata-table" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Delete Records from CarbonData Table</h2> -<p>This command allows us to delete records from CarbonData table.</p> -<h3> -<a id="syntax-3" class="anchor" href="#syntax-3" aria-hidden="true"><span 
aria-hidden="true" class="octicon octicon-link"></span></a>Syntax</h3> -<pre><code>DELETE FROM table_name [WHERE expression]; -</code></pre> -<h3> -<a id="parameter-description-6" class="anchor" href="#parameter-description-6" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Parameter Description</h3> -<table> -<thead> -<tr> -<th>Parameter</th> -<th>Description</th> -</tr> -</thead> -<tbody> -<tr> -<td>table_name</td> -<td>The name of the Carbon table in which you want to perform the delete.</td> -</tr> -</tbody> -</table> -<p>NOTE: This functionality is currently not supported in Spark 2.x and will support soon.</p> -<h3> -<a id="examples-2" class="anchor" href="#examples-2" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Examples</h3> -<pre><code>DELETE FROM columncarbonTable1 d WHERE d.column1 = 'china'; -</code></pre> -<pre><code>DELETE FROM dest WHERE column1 IN ('china', 'USA'); -</code></pre> -<pre><code>DELETE FROM columncarbonTable1 -WHERE column1 IN (SELECT column11 FROM sourceTable2); -</code></pre> -<pre><code>DELETE FROM columncarbonTable1 -WHERE column1 IN (SELECT column11 FROM sourceTable2 WHERE -column1 = 'USA'); -</code></pre> -<pre><code>DELETE FROM columncarbonTable1 WHERE column2 >= 4; -</code></pre> -<p><strong>The Status Success/Failure shall be captured in the driver log and the client.</strong></p> -</div> -</div> -</div> -</div> -<div class="doc-footer"> - <a href="#top" class="scroll-top">Top</a> -</div> -</div> -</section> -</div> -</div> -</div> -</section><!-- End systemblock part --> -<script src="js/custom.js"></script> -</body> -</html>
http://git-wip-us.apache.org/repos/asf/carbondata-site/blob/a51dc596/content/documentation.html ---------------------------------------------------------------------- diff --git a/content/documentation.html b/content/documentation.html index 634da94..982becf 100644 --- a/content/documentation.html +++ b/content/documentation.html @@ -159,7 +159,7 @@ <div class="nav__inner"> <a class="b-nav__intro nav__item" href="./introduction.html">introduction</a> <a class="b-nav__quickstart nav__item" href="./quick-start-guide.html">quick start</a> - <a class="b-nav__uses nav__item" href="./usescases.html">use cases</a> + <a class="b-nav__uses nav__item" href="./usecases.html">use cases</a> <div class="nav__item nav__item__with__subs"> <a class="b-nav__docs nav__item nav__sub__anchor" href="./language-manual.html">Language Reference</a> @@ -179,9 +179,9 @@ <a class="nav__item nav__sub__item" href="./timeseries-datamap-guide.html">Time Series</a> </div> - <a class="b-nav__s3 nav__item" href="./s3-guide.html">S3 Support</a> <a class="b-nav__api nav__item" href="./sdk-guide.html">API</a> <a class="b-nav__perf nav__item" href="./performance-tuning.html">Performance Tuning</a> + <a class="b-nav__s3 nav__item" href="./s3-guide.html">S3 Storage</a> <a class="b-nav__faq nav__item" href="./faq.html">FAQ</a> <a class="b-nav__contri nav__item" href="./how-to-contribute-to-apache-carbondata.html">Contribute</a> <a class="b-nav__security nav__item" href="./security.html">Security</a> @@ -194,9 +194,9 @@ <div class="b-nav__uses navindicator__item"></div> <div class="b-nav__docs navindicator__item"></div> <div class="b-nav__datamap navindicator__item"></div> - <div class="b-nav__s3 navindicator__item"></div> <div class="b-nav__api navindicator__item"></div> <div class="b-nav__perf navindicator__item"></div> + <div class="b-nav__s3 navindicator__item"></div> <div class="b-nav__faq navindicator__item"></div> <div class="b-nav__contri navindicator__item"></div> <div class="b-nav__security 
navindicator__item"></div> @@ -215,13 +215,13 @@ <p>Apache CarbonData is a new big data file format for faster interactive query using advanced columnar storage, index, compression and encoding techniques to improve computing efficiency, which helps in speeding up queries by an order of magnitude faster over PetaBytes of data.</p> <h2> <a id="getting-started" class="anchor" href="#getting-started" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Getting Started</h2> -<p><strong>File Format Concepts:</strong> Start with the basics of understanding the <a href="./file-structure-of-carbondata.html#carbondata-file-structure">CarbonData file format</a> and its storage structure.This will help to understand other parts of the documentation, incuding deployment, programming and usage guides.</p> +<p><strong>File Format Concepts:</strong> Start with the basics of understanding the <a href="./file-structure-of-carbondata.html#carbondata-file-format">CarbonData file format</a> and its <a href="./file-structure-of-carbondata.html">storage structure</a>.This will help to understand other parts of the documentation, including deployment, programming and usage guides.</p> <p><strong>Quick Start:</strong> <a href="./quick-start-guide.html#installing-and-configuring-carbondata-to-run-locally-with-spark-shell">Run an example program</a> on your local machine or <a href="https://github.com/apache/carbondata/tree/master/examples/spark2/src/main/scala/org/apache/carbondata/examples" target=_blank>study some examples</a>.</p> <p><strong>CarbonData SQL Language Reference:</strong> CarbonData extends the Spark SQL language and adds several <a href="./ddl-of-carbondata.html">DDL</a> and <a href="./dml-of-carbondata.html">DML</a> statements to support operations on it.Refer to the <a href="./language-manual.html">Reference Manual</a> to understand the supported features and functions.</p> <p><strong>Programming Guides:</strong> You can read our guides 
about <a href="./sdk-guide.html">APIs supported</a> to learn how to integrate CarbonData with your applications.</p> <h2> -<a id="deployment" class="anchor" href="#deployment" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Deployment</h2> -<p>CarbonData can be integrated with popular Execution engines like <a href="./quick-start-guide.html#spark">Spark</a> and <a href="./quick-start-guide.html#presto">Presto</a>.Refer to the <a href="./quick-start-guide.html##deployment-modes">Installation and Configuration</a> section to understand all modes of Integrating CarbonData.</p> +<a id="integration" class="anchor" href="#integration" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Integration</h2> +<p>CarbonData can be integrated with popular Execution engines like <a href="./quick-start-guide.html#spark">Spark</a> and <a href="./quick-start-guide.html#presto">Presto</a>.Refer to the <a href="./quick-start-guide.html#integration">Installation and Configuration</a> section to understand all modes of Integrating CarbonData.</p> <h2> <a id="contributing-to-carbondata" class="anchor" href="#contributing-to-carbondata" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Contributing to CarbonData</h2> <p>The Apache CarbonData community welcomes all kinds of contributions from anyone with a passion for @@ -234,10 +234,6 @@ faster data format.Contributing to CarbonData doesn?t just mean writing code. 
He <p><strong>Blogs:</strong> Blogs by external users can be found <a href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=67635497" rel="nofollow">here</a>.</p> <p><strong>Performance reports:</strong> TPC-H performance reports can be found <a href="https://cwiki.apache.org/confluence/display/CARBONDATA/Performance+-+TPCH+Report+of+CarbonData+%281.2+version%29+and+Parquet+on+Spark+Execution+Engine" rel="nofollow">here</a>.</p> <p><strong>Trainings:</strong> Training records on design and code flows can be found <a href="https://cwiki.apache.org/confluence/display/CARBONDATA/CarbonData+Training+Materials" rel="nofollow">here</a>.</p> -<script> -// Show selected style on nav item -$(function() { $('.b-nav__intro').addClass('selected'); }); -</script> </div> </div> </div> @@ -253,4 +249,4 @@ $(function() { $('.b-nav__intro').addClass('selected'); }); </section><!-- End systemblock part --> <script src="js/custom.js"></script> </body> -</html> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/carbondata-site/blob/a51dc596/content/errorpage.html ---------------------------------------------------------------------- diff --git a/content/errorpage.html b/content/errorpage.html index 35cc01a..090dce5 100644 --- a/content/errorpage.html +++ b/content/errorpage.html @@ -89,4 +89,4 @@ </body> -</html> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/carbondata-site/blob/a51dc596/content/faq.html ---------------------------------------------------------------------- diff --git a/content/faq.html b/content/faq.html index 52112b7..c37284f 100644 --- a/content/faq.html +++ b/content/faq.html @@ -159,7 +159,7 @@ <div class="nav__inner"> <a class="b-nav__intro nav__item" href="./introduction.html">introduction</a> <a class="b-nav__quickstart nav__item" href="./quick-start-guide.html">quick start</a> - <a class="b-nav__uses nav__item" href="./usescases.html">use cases</a> + <a class="b-nav__uses nav__item" 
href="./usecases.html">use cases</a> <div class="nav__item nav__item__with__subs"> <a class="b-nav__docs nav__item nav__sub__anchor" href="./language-manual.html">Language Reference</a> @@ -179,9 +179,9 @@ <a class="nav__item nav__sub__item" href="./timeseries-datamap-guide.html">Time Series</a> </div> - <a class="b-nav__s3 nav__item" href="./s3-guide.html">S3 Support</a> <a class="b-nav__api nav__item" href="./sdk-guide.html">API</a> <a class="b-nav__perf nav__item" href="./performance-tuning.html">Performance Tuning</a> + <a class="b-nav__s3 nav__item" href="./s3-guide.html">S3 Storage</a> <a class="b-nav__faq nav__item" href="./faq.html">FAQ</a> <a class="b-nav__contri nav__item" href="./how-to-contribute-to-apache-carbondata.html">Contribute</a> <a class="b-nav__security nav__item" href="./security.html">Security</a> @@ -194,9 +194,9 @@ <div class="b-nav__uses navindicator__item"></div> <div class="b-nav__docs navindicator__item"></div> <div class="b-nav__datamap navindicator__item"></div> - <div class="b-nav__s3 navindicator__item"></div> <div class="b-nav__api navindicator__item"></div> <div class="b-nav__perf navindicator__item"></div> + <div class="b-nav__s3 navindicator__item"></div> <div class="b-nav__faq navindicator__item"></div> <div class="b-nav__contri navindicator__item"></div> <div class="b-nav__security navindicator__item"></div> @@ -222,19 +222,19 @@ <li><a href="#how-to-resolve-abstract-method-error">How to resolve Abstract Method Error?</a></li> <li><a href="#how-carbon-will-behave-when-execute-insert-operation-in-abnormal-scenarios">How Carbon will behave when execute insert operation in abnormal scenarios?</a></li> <li><a href="#why-aggregate-query-is-not-fetching-data-from-aggregate-table">Why aggregate query is not fetching data from aggregate table?</a></li> -<li><a href="#Why-all-executors-are-showing-success-in-Spark-UI-even-after-Dataload-command-failed-at-driver-side">Why all executors are showing success in Spark UI even after 
Dataload command failed at Driver side?</a></li> -<li><a href="#Why-different-time-zone-result-for-select-query-output-when-query-SDK-writer-output">Why different time zone result for select query output when query SDK writer output?</a></li> +<li><a href="#why-all-executors-are-showing-success-in-spark-ui-even-after-dataload-command-failed-at-driver-side">Why all executors are showing success in Spark UI even after Dataload command failed at Driver side?</a></li> +<li><a href="#why-different-time-zone-result-for-select-query-output-when-query-sdk-writer-output">Why different time zone result for select query output when query SDK writer output?</a></li> </ul> <h1> <a id="troubleshooting" class="anchor" href="#troubleshooting" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>TroubleShooting</h1> <ul> -<li><a href="#Getting-tablestatus.lock-issues-When-loading-data">Getting tablestatus.lock issues When loading data</a></li> +<li><a href="#getting-tablestatuslock-issues-when-loading-data">Getting tablestatus.lock issues When loading data</a></li> <li><a href="#failed-to-load-thrift-libraries">Failed to load thrift libraries</a></li> <li><a href="#failed-to-launch-the-spark-shell">Failed to launch the Spark Shell</a></li> <li><a href="#failed-to-execute-load-query-on-cluster">Failed to execute load query on cluster</a></li> <li><a href="#failed-to-execute-insert-query-on-cluster">Failed to execute insert query on cluster</a></li> <li><a href="#failed-to-connect-to-hiveuser-with-thrift">Failed to connect to hiveuser with thrift</a></li> -<li><a href="#failed-to-read-the-metastore-db-during-table">Failed to read the metastore db during table</a></li> +<li><a href="#failed-to-read-the-metastore-db-during-table-creation">Failed to read the metastore db during table creation</a></li> <li><a href="#failed-to-load-data-on-the-cluster">Failed to load data on the cluster</a></li> <li><a href="#failed-to-insert-data-on-the-cluster">Failed to 
insert data on the cluster</a></li> <li><a href="#failed-to-execute-concurrent-operations-on-table-by-multiple-workers">Failed to execute Concurrent Operations(Load,Insert,Update) on table by multiple workers</a></li> @@ -292,7 +292,7 @@ The property carbon.lock.type configuration specifies the type of lock to be acq <p>In order to build CarbonData project it is necessary to specify the spark profile. The spark profile sets the Spark Version. You need to specify the <code>spark version</code> while using Maven to build project.</p> <h2> <a id="how-carbon-will-behave-when-execute-insert-operation-in-abnormal-scenarios" class="anchor" href="#how-carbon-will-behave-when-execute-insert-operation-in-abnormal-scenarios" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>How Carbon will behave when execute insert operation in abnormal scenarios?</h2> -<p>Carbon support insert operation, you can refer to the syntax mentioned in <a href="dml-operation-on-carbondata.html">DML Operations on CarbonData</a>. +<p>Carbon support insert operation, you can refer to the syntax mentioned in <a href="./dml-of-carbondata.html">DML Operations on CarbonData</a>. 
First, create a source table in spark-sql and load data into this created table.</p> <pre><code>CREATE TABLE source_table( id String, @@ -312,7 +312,7 @@ id name city id String, city String, name String) -STORED BY 'carbondata'; +STORED AS carbondata; </code></pre> <pre><code>INSERT INTO TABLE carbon_table SELECT * FROM source_table; </code></pre> @@ -341,7 +341,7 @@ id city name When SubQuery predicate is present in the query.</li> </ul> <p>Example:</p> -<pre><code>create table gdp21(cntry smallint, gdp double, y_year date) stored by 'carbondata'; +<pre><code>create table gdp21(cntry smallint, gdp double, y_year date) stored as carbondata; create datamap ag1 on table gdp21 using 'preaggregate' as select cntry, sum(gdp) from gdp21 group by cntry; select ctry from pop1 where ctry in (select cntry from gdp21 group by cntry); </code></pre> @@ -351,7 +351,7 @@ select ctry from pop1 where ctry in (select cntry from gdp21 group by cntry); When aggregate function along with 'in' filter.</li> </ul> <p>Example:</p> -<pre><code>create table gdp21(cntry smallint, gdp double, y_year date) stored by 'carbondata'; +<pre><code>create table gdp21(cntry smallint, gdp double, y_year date) stored as carbondata; create datamap ag1 on table gdp21 using 'preaggregate' as select cntry, sum(gdp) from gdp21 group by cntry; select cntry, sum(gdp) from gdp21 where cntry in (select ctry from pop1) group by cntry; </code></pre> @@ -361,7 +361,7 @@ select cntry, sum(gdp) from gdp21 where cntry in (select ctry from pop1) group b When aggregate function having 'join' with equal filter.</li> </ul> <p>Example:</p> -<pre><code>create table gdp21(cntry smallint, gdp double, y_year date) stored by 'carbondata'; +<pre><code>create table gdp21(cntry smallint, gdp double, y_year date) stored as carbondata; create datamap ag1 on table gdp21 using 'preaggregate' as select cntry, sum(gdp) from gdp21 group by cntry; select cntry,sum(gdp) from gdp21,pop1 where cntry=ctry group by cntry; </code></pre> @@ 
-566,8 +566,7 @@ For example, you can use scp to copy this file to all the nodes.</p> <script> // Show selected style on nav item $(function() { $('.b-nav__faq').addClass('selected'); }); -</script> -</div> +</script></div> </div> </div> </div> @@ -582,4 +581,4 @@ $(function() { $('.b-nav__faq').addClass('selected'); }); </section><!-- End systemblock part --> <script src="js/custom.js"></script> </body> -</html> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/carbondata-site/blob/a51dc596/content/file-structure-of-carbondata.html ---------------------------------------------------------------------- diff --git a/content/file-structure-of-carbondata.html b/content/file-structure-of-carbondata.html index b96d622..c14ea6d 100644 --- a/content/file-structure-of-carbondata.html +++ b/content/file-structure-of-carbondata.html @@ -159,7 +159,7 @@ <div class="nav__inner"> <a class="b-nav__intro nav__item" href="./introduction.html">introduction</a> <a class="b-nav__quickstart nav__item" href="./quick-start-guide.html">quick start</a> - <a class="b-nav__uses nav__item" href="./usescases.html">use cases</a> + <a class="b-nav__uses nav__item" href="./usecases.html">use cases</a> <div class="nav__item nav__item__with__subs"> <a class="b-nav__docs nav__item nav__sub__anchor" href="./language-manual.html">Language Reference</a> @@ -179,9 +179,9 @@ <a class="nav__item nav__sub__item" href="./timeseries-datamap-guide.html">Time Series</a> </div> - <a class="b-nav__s3 nav__item" href="./s3-guide.html">S3 Support</a> <a class="b-nav__api nav__item" href="./sdk-guide.html">API</a> <a class="b-nav__perf nav__item" href="./performance-tuning.html">Performance Tuning</a> + <a class="b-nav__s3 nav__item" href="./s3-guide.html">S3 Storage</a> <a class="b-nav__faq nav__item" href="./faq.html">FAQ</a> <a class="b-nav__contri nav__item" href="./how-to-contribute-to-apache-carbondata.html">Contribute</a> <a class="b-nav__security nav__item" 
href="./security.html">Security</a> @@ -194,9 +194,9 @@ <div class="b-nav__uses navindicator__item"></div> <div class="b-nav__docs navindicator__item"></div> <div class="b-nav__datamap navindicator__item"></div> - <div class="b-nav__s3 navindicator__item"></div> <div class="b-nav__api navindicator__item"></div> <div class="b-nav__perf navindicator__item"></div> + <div class="b-nav__s3 navindicator__item"></div> <div class="b-nav__faq navindicator__item"></div> <div class="b-nav__contri navindicator__item"></div> <div class="b-nav__security navindicator__item"></div> @@ -211,30 +211,138 @@ <div class="col-sm-12 col-md-12"> <div> <h1> -<a id="carbondata-file-structure" class="anchor" href="#carbondata-file-structure" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>CarbonData File Structure</h1> +<a id="carbondata-table-structure" class="anchor" href="#carbondata-table-structure" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>CarbonData table structure</h1> <p>CarbonData files contain groups of data called blocklets, along with all required information like schema, offsets and indices etc, in a file header and footer, co-located in HDFS.</p> <p>The file footer can be read once to build the indices in memory, which can be utilized for optimizing the scans and processing for all subsequent queries.</p> -<h3> -<a id="understanding-carbondata-file-structure" class="anchor" href="#understanding-carbondata-file-structure" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Understanding CarbonData File Structure</h3> +<p>This document describes what a CarbonData table looks like in an HDFS directory, the files written and the content of each file.</p> <ul> -<li>Block : It would be as same as HDFS block, CarbonData creates one file for each data block, user can specify TABLE_BLOCKSIZE during creation table. 
Each file contains File Header, Blocklets and File Footer.</li> -</ul> -<p><a href="../docs/images/carbon_data_file_structure_new.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/carbon_data_file_structure_new.png?raw=true" alt="CarbonData File Structure" style="max-width:100%;"></a></p> +<li> +<p><a href="#file-directory-structure">File Directory Structure</a></p> +</li> +<li> +<p><a href="#file-content-details">File Content details</a></p> <ul> -<li>File Header : It contains CarbonData file version number, list of column schema and schema updation timestamp.</li> -<li>File Footer : it contains Number of rows, segmentinfo ,all blocklets? info and index, you can find the detail from the below diagram.</li> -<li>Blocklet : Rows are grouped to form a blocklet, the size of the blocklet is configurable and default size is 64MB, Blocklet contains Column Page groups for each column.</li> -<li>Column Page Group : Data of one column and it is further divided into pages, it is guaranteed to be contiguous in file.</li> -<li>Page : It has the data of one column and the number of row is fixed to 32000 size.</li> -</ul> -<p><a href="../docs/images/carbon_data_format_new.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/carbon_data_format_new.png?raw=true" alt="CarbonData File Format" style="max-width:100%;"></a></p> -<h3> -<a id="each-page-contains-three-types-of-data" class="anchor" href="#each-page-contains-three-types-of-data" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Each page contains three types of data</h3> +<li><a href="#schema-file-format">Schema file format</a></li> +<li> +<a href="#carbondata-file-format">CarbonData file format</a> <ul> -<li>Data Page: Contains the encoded data of a column of columns.</li> -<li>Row ID Page (optional): Contains the row ID mappings used when 
the data page is stored as an inverted index.</li> -<li>RLE Page (optional): Contains additional metadata used when the data page is RLE coded.</li> +<li> +<a href="#blocklet-format">Blocklet format</a> +<ul> +<li><a href="#v1">V1</a></li> +<li><a href="#v2">V2</a></li> +<li><a href="#v3">V3</a></li> +</ul> +</li> +<li><a href="#footer-format">Footer format</a></li> </ul> +</li> +<li><a href="#carbonindex-file-format">carbonindex file format</a></li> +<li><a href="#dictionary-file-format">Dictionary file format</a></li> +<li><a href="#tablestatus-file-format">tablestatus file format</a></li> +</ul> +</li> +</ul> +<h2> +<a id="file-directory-structure" class="anchor" href="#file-directory-structure" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>File Directory Structure</h2> +<p>The CarbonData files are stored in the location specified by the <em><strong>carbon.storelocation</strong></em> configuration (configured in carbon.properties; if not configured, the default is ../carbon.store).</p> +<p>The file directory structure is as below:</p> +<p><a href="../docs/images/2-1_1.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-1_1.png?raw=true" alt="File Directory Structure" style="max-width:100%;"></a></p> +<ol> +<li>ModifiedTime.mdt records the timestamp of the metadata with the modification time attribute of the file. When the drop table and create table are used, the modification time of the file is updated. This is common to all databases and hence is kept in parallel to databases.</li> +<li>The <strong>default</strong> is the database name and contains the user tables. default is used when the user doesn't specify any database name; otherwise the user-configured database name will be the directory name. 
user_table is the table name.</li> +<li>Metadata directory stores schema files, tablestatus and dictionary files (including .dict, .dictmeta and .sortindex). 
There are three types of metadata data information files.</li> +<li>data and index files are stored under directory named <strong>Fact</strong>. The Fact directory has a Part0 partition directory, where 0 is the partition number.</li> +<li>There is a Segment_0 directory under the Part0 directory, where 0 is the segment number.</li> +<li>There are two types of files, carbondata and carbonindex, in the Segment_0 directory.</li> +</ol> +<h2> +<a id="file-content-details" class="anchor" href="#file-content-details" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>File Content details</h2> +<p>When the table is created, the user_table directory is generated, and a schema file is generated in the Metadata directory for recording the table structure.</p> +<p>When loading data in batches, each batch loading generates a new segment directory. The scheduling tries to control a task processing data loading task on each node. Each task will generate multiple carbondata files and one carbonindex file.</p> +<p>During global dictionary generation, if the two-pass scheme is used, before the data is loaded, the corresponding dict, dictmeta and sortindex files are generated for each dictionary-encoded column, and partial dictionary files can be provided by the pre-define dictionary method to reduce the need. A dictionary-encoded column is generated by scanning the full amount of data; a dictionary file of all dictionary code columns can also be provided by the all dictionary method to avoid scanning data. 
If the single-pass scheme is adopted, the global dictionary code is generated in real time during data loading, and after the data is loaded, the dictionary is solidified into a dictionary file.</p> +<p>The following sections use the Java object generated by the thrift file describing the carbondata file format to explain the contents of each file one by one (you can also directly read the format defined in the <a href="https://github.com/apache/carbondata/tree/master/format/src/main/thrift" target=_blank>thrift file</a>)</p> +<h3> +<a id="schema-file-format" class="anchor" href="#schema-file-format" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Schema file format</h3> +<p>The contents of the schema file are as shown below:</p> +<p><a href="../docs/images/2-2_1.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-2_1.png?raw=true" alt="Schema file format" style="max-width:100%;"></a></p> +<ol> +<li>TableSchema class +The TableSchema class does not store the table name; it is inferred from the directory name (user_table). +tableProperties is used to record table-related properties, such as: table_blocksize.</li> +<li>ColumnSchema class +Encoders are used to record the encoding used in column storage. 
+columnProperties is used to record column related properties.</li> +<li>BucketingInfo class +When creating a bucket table, you can specify the number of buckets in the table and the column used to split buckets.</li> +<li>DataType class +Describes the data types supported by CarbonData.</li> +<li>Encoding class +Several encodings that may be used in CarbonData files.</li> +</ol> +<h3> +<a id="carbondata-file-format" class="anchor" href="#carbondata-file-format" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>CarbonData file format</h3> +<h4> +<a id="file-header" class="anchor" href="#file-header" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>File Header</h4> +<p>It contains CarbonData file version number, list of column schema and schema update timestamp.</p> +<p><a href="../docs/images/carbon_data_file_structure_new.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/carbon_data_file_structure_new.png?raw=true" alt="File Header" style="max-width:100%;"></a></p> +<p>The carbondata file consists of multiple blocklets and footer parts. The blocklet is the dataset inside the carbondata file (the latest V3 format, the default configuration is 64MB), each blocklet contains a ColumnChunk for each column, and a ColumnChunk may contain one or more Column Pages.</p> +<p>The carbondata file currently supports V1, V2 and V3 versions. 
The main difference is the change of the blocklet part, which is introduced one by one.</p> +<h4> +<a id="blocklet-format" class="anchor" href="#blocklet-format" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Blocklet format</h4> +<h5> +<a id="v1" class="anchor" href="#v1" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>V1</h5> +<p>Blocklet consists of all column data pages, RLE pages, and rowID pages. 
Since the pages in the blocklet are grouped according to the page type, the three pieces of data of each column are distributed and stored in the blocklet, and the offset and length information of all the pages need to be recorded in the footer part.</p> +<p><a href="../docs/images/2-3_1.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-3_1.png?raw=true" alt="V1" style="max-width:100%;"></a></p> +<h5> +<a id="v2" class="anchor" href="#v2" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>V2</h5> +<p>The blocklet consists of ColumnChunk for all columns. The ColumnChunk for a column consists of a ColumnPage, which includes the data chunk header, data page, RLE page, and rowID page. Since ColumnChunk aggregates the three types of Page data of the column together, it can read the column data using fewer readers. Since the header part records the length information of all the pages, the footer part only needs to record the offset and length of the ColumnChunk, which also reduces the amount of footer data.</p> +<p><a href="../docs/images/2-3_2.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-3_2.png?raw=true" alt="V2" style="max-width:100%;"></a></p> +<h5> +<a id="v3" class="anchor" href="#v3" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>V3</h5> +<p>The blocklet is also composed of ColumnChunks of all columns. What is changed is that a ColumnChunk consists of one or more Column Pages, and Column Page adds a new BlockletMinMaxIndex.</p> +<p>Compared with V2: The blocklet data volume of V2 format defaults to 120,000 lines, and the blocklet data volume of V3 format defaults to 64MB. 
For the same size data file, the information of the footer part index metadata may be further reduced; meanwhile, the V3 format adds new page-level
data filtering, and the amount of data per page is only 32,000 lines by default, which is much less than the 120,000 lines of V2 format. The accuracy of data filtering is further improved, and more data can be filtered out before decompressing data.</p> +<p><a href="../docs/images/2-3_3.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-3_3.png?raw=true" alt="V3" style="max-width:100%;"></a></p> +<h4> +<a id="footer-format" class="anchor" href="#footer-format" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Footer format</h4> +<p>Footer records, for each carbondata file, +all blocklet data distribution information and statistics-related metadata information (minmax, startkey/endkey) inside the file.</p> +<p><a href="../docs/images/2-3_4.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-3_4.png?raw=true" alt="Footer format" style="max-width:100%;"></a></p> +<ol> +<li>BlockletInfo3 is used to record the offset and length of all ColumnChunk3.</li> +<li>SegmentInfo is used to record the number of columns and the cardinality of each column.</li> +<li>BlockletIndex includes BlockletMinMaxIndex and BlockletBTreeIndex.</li> +</ol> +<p>BlockletBTreeIndex is used to record the startkey/endkey of all blocklets in the block. When querying, the startkey/endkey of the query is generated by filtering conditions combined with mdkey. With BlockletBTreeIndex, the range of blocklets satisfying the conditions in each block can be delineated.</p> +<p>BlockletMinMaxIndex is used to record the min/max value of all columns in the blocklet. 
By using the min/max check on the filter condition, you can skip the block/blocklet that does not satisfy the condition.</p> +<h3> +<a id="carbonindex-file-format" class="anchor" href="#carbonindex-file-format" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>carbonindex file format</h3> +<p>Extract the BlockletIndex part of the footer part to generate the carbonindex file. Load data in batches, schedule as much as possible to control a node to start a task, each task generates multiple carbondata files and a carbonindex file. The carbonindex file records the index information of all the blocklets in all the carbondata files generated by the task.</p> +<p>As shown in the figure, the index information corresponding to a block is recorded by a BlockIndex object, including carbondata filename, footer offset and BlockletIndex. The BlockIndex data volume is less than the footer. The file is directly used to build the index on the driver side when querying, without having to skip the footer part of the data volume of multiple data files.</p> +<p><a href="../docs/images/2-4_1.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-4_1.png?raw=true" alt="carbonindex file format" style="max-width:100%;"></a></p> +<h3> +<a id="dictionary-file-format" class="anchor" href="#dictionary-file-format" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>Dictionary file format</h3> +<p>For each dictionary encoded column, a dictionary file is used to store the dictionary metadata for that column.</p> +<ol> +<li>dict file records the distinct value list of a column</li> +</ol> +<p>For the first time dataloading, the file is generated using a distinct value list of a column. The value in the file is unordered; the subsequent append is used. 
In the second step of dataloading (Data Convert Step), the dictionary code column will replace the true value of the data with the dictionary key.</p> +<p><a href="../docs/images/2-5_1.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-5_1.png?raw=true" alt="Dictionary file format" style="max-width:100%;"></a></p> +<ol start="2"> +<li>dictmeta records the metadata description of the new distinct value of each dataloading</li> +</ol> +<p>The dictionary cache uses this information to incrementally flush the cache.</p> +<p><a href="../docs/images/2-5_2.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-5_2.png?raw=true" alt="Dictionary Chunk" style="max-width:100%;"></a></p> +<ol start="3"> +<li>sortindex records the result set of the key code of the dictionary code sorted by value.</li> +</ol> +<p>In dataloading, if there is a new dictionary value, the sortindex file will be regenerated using all the dictionary codes.</p> +<p>Filtering queries based on dictionary code columns need to convert the value filter condition to the key filter condition. 
Using the sortindex file, you can quickly construct an ordered value sequence to quickly find the key value corresponding to the value, thus speeding up the conversion process.</p> +<p><a href="../docs/images/2-5_3.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-5_3.png?raw=true" alt="sortindex file format" style="max-width:100%;"></a></p> +<h3> +<a id="tablestatus-file-format" class="anchor" href="#tablestatus-file-format" aria-hidden="true"><span aria-hidden="true" class="octicon octicon-link"></span></a>tablestatus file format</h3> +<p>Tablestatus records the segment-related information (in gson format) for each load and merge, including load time, load status, segment name, whether it was deleted, and the segment name incorporated. The tablestatus file is regenerated after each load or merge.</p> +<p><a href="../docs/images/2-6_1.png?raw=true" target="_blank" rel="noopener noreferrer"><img src="https://github.com/apache/carbondata/blob/master/docs/images/2-6_1.png?raw=true" alt="tablestatus file format" style="max-width:100%;"></a></p> <script> $(function() { // Show selected style on nav item @@ -245,8 +353,7 @@ $(function() { $('.b-nav__docs').parent().toggleClass('nav__item__with__subs--expanded'); } }); -</script> -</div> +</script></div> </div> </div> </div> @@ -261,4 +368,4 @@ $(function() { </section><!-- End systemblock part --> <script src="js/custom.js"></script> </body> -</html> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/carbondata-site/blob/a51dc596/content/how-to-contribute-to-apache-carbondata.html ---------------------------------------------------------------------- diff --git a/content/how-to-contribute-to-apache-carbondata.html b/content/how-to-contribute-to-apache-carbondata.html index 946013f..122b763 100644 --- a/content/how-to-contribute-to-apache-carbondata.html +++ b/content/how-to-contribute-to-apache-carbondata.html @@ -159,7 
+159,7 @@ <div class="nav__inner"> <a class="b-nav__intro nav__item" href="./introduction.html">introduction</a> <a class="b-nav__quickstart nav__item" href="./quick-start-guide.html">quick start</a> - <a class="b-nav__uses nav__item" href="./usescases.html">use cases</a> + <a class="b-nav__uses nav__item" href="./usecases.html">use cases</a> <div class="nav__item nav__item__with__subs"> <a class="b-nav__docs nav__item nav__sub__anchor" href="./language-manual.html">Language Reference</a> @@ -179,9 +179,9 @@ <a class="nav__item nav__sub__item" href="./timeseries-datamap-guide.html">Time Series</a> </div> - <a class="b-nav__s3 nav__item" href="./s3-guide.html">S3 Support</a> <a class="b-nav__api nav__item" href="./sdk-guide.html">API</a> <a class="b-nav__perf nav__item" href="./performance-tuning.html">Performance Tuning</a> + <a class="b-nav__s3 nav__item" href="./s3-guide.html">S3 Storage</a> <a class="b-nav__faq nav__item" href="./faq.html">FAQ</a> <a class="b-nav__contri nav__item" href="./how-to-contribute-to-apache-carbondata.html">Contribute</a> <a class="b-nav__security nav__item" href="./security.html">Security</a> @@ -194,9 +194,9 @@ <div class="b-nav__uses navindicator__item"></div> <div class="b-nav__docs navindicator__item"></div> <div class="b-nav__datamap navindicator__item"></div> - <div class="b-nav__s3 navindicator__item"></div> <div class="b-nav__api navindicator__item"></div> <div class="b-nav__perf navindicator__item"></div> + <div class="b-nav__s3 navindicator__item"></div> <div class="b-nav__faq navindicator__item"></div> <div class="b-nav__contri navindicator__item"></div> <div class="b-nav__security navindicator__item"></div> @@ -352,8 +352,7 @@ $ git push <GitHub_user> --delete <my-branch> <script> // Show selected style on nav item $(function() { $('.b-nav__contri').addClass('selected'); }); -</script> -</div> +</script></div> </div> </div> </div> @@ -368,4 +367,4 @@ $(function() { $('.b-nav__contri').addClass('selected'); }); 
</section><!-- End systemblock part --> <script src="js/custom.js"></script> </body> -</html> +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/carbondata-site/blob/a51dc596/content/index.html ---------------------------------------------------------------------- diff --git a/content/index.html b/content/index.html index e9e6e3b..f059d16 100644 --- a/content/index.html +++ b/content/index.html @@ -477,7 +477,7 @@ </ol> <p class="title-info"> For detailed reference on CarbonData, read the <a - href="mainpage.html">User Guide</a>. + href="documentation.html">User Guide</a>. </p> </div>
