Added: phoenix/site/publish/tablesample.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/tablesample.html?rev=1811040&view=auto ============================================================================== --- phoenix/site/publish/tablesample.html (added) +++ phoenix/site/publish/tablesample.html Wed Oct 4 07:09:48 2017 @@ -0,0 +1,484 @@ + +<!DOCTYPE html> +<!-- + Generated by Apache Maven Doxia at 2017-10-04 + Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) +--> +<html xml:lang="en" lang="en"> + + <head> + <meta charset="UTF-8" /> + <title>Table Sampliing | Apache Phoenix</title> + <meta name="viewport" content="width=device-width, initial-scale=1.0" /> + <meta name="description" content="" /> + <meta http-equiv="content-language" content="en" /> + + <link href="//netdna.bootstrapcdn.com/bootswatch/2.3.2/flatly/bootstrap.min.css" rel="stylesheet" /> + <link href="//netdna.bootstrapcdn.com/twitter-bootstrap/2.3.1/css/bootstrap-responsive.min.css" rel="stylesheet" /> + <link href="./css/bootswatch.css" rel="stylesheet" /> + <link href="./css/reflow-skin.css" rel="stylesheet" /> + + <link href="//yandex.st/highlightjs/7.5/styles/default.min.css" rel="stylesheet" /> + + <link href="./css/lightbox.css" rel="stylesheet" /> + + <link href="./css/site.css" rel="stylesheet" /> + <link href="./css/print.css" rel="stylesheet" media="print" /> + + <!-- Le HTML5 shim, for IE6-8 support of HTML5 elements --> + <!--[if lt IE 9]> + <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script> + <![endif]--> + + + + </head> + + <body class="page-tablesample project-phoenix-site" data-spy="scroll" data-offset="60" data-target="#toc-scroll-target"> + + <div class="navbar navbar-fixed-top"> + <div class="navbar-inner"> + <div class="container"> + <a class="btn btn-navbar" data-toggle="collapse" data-target="#top-nav-collapse"> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </a> + <a class="brand" href="index.html"><div class="xtoplogo"></div></a> + <div class="nav-collapse collapse" id="top-nav-collapse"> + <ul class="nav pull-right"> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">About <b class="caret"></b></a> + <ul class="dropdown-menu"> + <li ><a href="index.html" title="Overview">Overview</a></li> + <li ><a href="who_is_using.html" title="Who is Using">Who is Using</a></li> + <li ><a href="recent.html" title="Recent Improvements">Recent Improvements</a></li> + <li ><a href="roadmap.html" title="Roadmap">Roadmap</a></li> + <li ><a href="news.html" title="News">News</a></li> + <li ><a href="performance.html" title="Performance">Performance</a></li> + <li ><a href="team.html" title="Team">Team</a></li> + <li ><a href="resources.html" title="Presentations">Presentations</a></li> + <li ><a href="mailing_list.html" title="Mailing Lists">Mailing Lists</a></li> + <li ><a href="source.html" title="Source Repository">Source Repository</a></li> + <li ><a href="issues.html" title="Issue Tracking">Issue Tracking</a></li> + <li ><a href="download.html" title="Download">Download</a></li> + <li ><a href="installation.html" title="Installation">Installation</a></li> + <li class="divider"/> + <li ><a href="contributing.html" title="How to Contribute">How to Contribute</a></li> + <li ><a href="develop.html" title="How to Develop">How to Develop</a></li> + <li ><a href="building_website.html" title="How to Update Website">How to Update Website</a></li> + <li ><a href="release.html" title="How to Release">How to Release</a></li> + <li class="divider"/> + <li ><a href="http://www.apache.org/licenses/" title="License" class="externalLink">License</a></li> + </ul> + </li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Using <b class="caret"></b></a> + <ul class="dropdown-menu"> + <li ><a href="faq.html" title="F.A.Q.">F.A.Q.</a></li> + <li ><a href="Phoenix-in-15-minutes-or-less.html" title="Quick Start">Quick Start</a></li> + <li ><a href="building.html" title="Building">Building</a></li> + <li ><a href="tuning_guide.html" title="Tuning">Tuning</a></li> + <li ><a href="tuning.html" title="Configuration">Configuration</a></li> + <li ><a href="upgrading.html" title="Backward Compatibility">Backward Compatibility</a></li> + <li ><a href="release_notes.html" title="Release Notes">Release Notes</a></li> + <li ><a href="pherf.html" title="Performance Testing">Performance Testing</a></li> + <li class="divider"/> + <li ><a href="phoenix_spark.html" title="Apache Spark Integration">Apache Spark Integration</a></li> + <li ><a href="hive_storage_handler.html" title="Phoenix Storage Handler for Apache Hive">Phoenix Storage Handler for Apache Hive</a></li> + <li ><a href="pig_integration.html" title="Apache Pig Integration">Apache Pig Integration</a></li> + <li ><a href="phoenix_mr.html" title="Map Reduce Integration">Map Reduce Integration</a></li> + <li ><a href="flume.html" title="Apache Flume Plugin">Apache Flume Plugin</a></li> + <li ><a href="kafka.html" title="Apache Kafka Plugin">Apache Kafka Plugin</a></li> + </ul> + </li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Addons <b class="caret"></b></a> + <ul class="dropdown-menu"> + <li ><a href="http://docs.aws.amazon.com/ElasticMapReduce/latest/ReleaseGuide/emr-phoenix.html" title="Phoenix on Amazon EMR" class="externalLink">Phoenix on Amazon EMR</a></li> + <li ><a href="http://python-phoenixdb.readthedocs.io/en/latest" title="Phoenix Adapter for Python" class="externalLink">Phoenix Adapter for Python</a></li> + <li ><a href="phoenix_orm.html" title="Phoenix ORM Library">Phoenix ORM Library</a></li> + </ul> + </li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Features <b class="caret"></b></a> + <ul class="dropdown-menu"> + <li ><a href="transactions.html" title="Transactions">Transactions</a></li> + <li ><a href="udf.html" title="User-defined Functions">User-defined Functions</a></li> + <li class="divider"/> + <li ><a href="secondary_indexing.html" title="Secondary Indexes">Secondary Indexes</a></li> + <li ><a href="columnencoding.html" title="Storage Formats">Storage Formats</a></li> + <li ><a href="atomic_upsert.html" title="Atomic Upsert">Atomic Upsert</a></li> + <li ><a href="namspace_mapping.html" title="Namespace Mapping">Namespace Mapping</a></li> + <li ><a href="update_statistics.html" title="Statistics Collection">Statistics Collection</a></li> + <li ><a href="rowtimestamp.html" title="Row Timestamp Column">Row Timestamp Column</a></li> + <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> + <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> + <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li class="active"><a href="" title="Table Sampling">Table Sampling</a></li> + <li class="divider"/> + <li ><a href="views.html" title="Views">Views</a></li> + <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> + <li ><a href="dynamic_columns.html" title="Dynamic Columns">Dynamic Columns</a></li> + <li class="divider"/> + <li ><a href="bulk_dataload.html" title="Bulk Loading">Bulk Loading</a></li> + <li ><a href="server.html" title="Query Server">Query Server</a></li> + <li ><a href="metrics.html" title="Metrics">Metrics</a></li> + <li ><a href="tracing.html" title="Tracing">Tracing</a></li> + <li ><a href="cursors.html" title="Cursor">Cursor</a></li> + </ul> + </li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Reference <b class="caret"></b></a> + <ul class="dropdown-menu"> + <li ><a href="language/index.html" title="Grammar">Grammar</a></li> + <li ><a href="language/functions.html" title="Functions">Functions</a></li> + <li ><a href="language/datatypes.html" title="Datatypes">Datatypes</a></li> + <li ><a href="array_type.html" title="ARRAY type">ARRAY type</a></li> + <li class="divider"/> + <li ><a href="sequences.html" title="Sequences">Sequences</a></li> + <li ><a href="joins.html" title="Joins">Joins</a></li> + <li ><a href="subqueries.html" title="Subqueries">Subqueries</a></li> + </ul> + </li> + </ul> + </div><!--/.nav-collapse --> + </div> + </div> + </div> + + <div class="container"> + + <!-- Masthead + ================================================== --> + + <header> + </header> + + <div class="main-body"> + <div class="row"> + <div class="span12"> + <div class="body-content"> +<div class="page-header"> + <h1>Table Sampliing</h1> +</div> +<p>To support table sampling, similar to the Postgresql and T-SQL syntax, a TABLESAMPLE clause has been encorporated into the table aliases ref as of Phoenix 4.12. The general syntax is described <a href="language/index.html#aliased_table_ref">here</a>. This feature limits the number of rows returned from a table to a PERCENT of rows. See <a class="externalLink" href="https://issues.apache.org/jira/browse/PHOENIX-153">PHOENIX-153</a> for implementation details.</p> +<p>This feature is implmented with a Bernoulli trial, a consistent-hashing based tablesampler, to achieve a Bernoulli sampling on a given population (rows). When executed, given a sampling rate, it leverages Phoenixâs stats as well HBaseâs region distribution to perform a table sampling.</p> +<p>As part of the <a href="update_statistics.html">statistics collection</a>, a guide post (a row referrence) is created for rows at equidistant bytes from each other. When sampling is required, a Bernoulli trial process is applied repeatedly on each guide post on each region with a probability that porportional to the sampling rate. A included guide post will result in all the rows in between this and next guide post being included into sample population.</p> +<div class="section"> + <div class="section"> + <div class="section"> + <h4 id="Performance">Performance</h4> + <p>Sampling on a table with a sampling rate = 100% will cost same amount of computational resource as a query without sampling. This resouces consumption quicky fall off when a smaller sampling rate is choosing. In general, the amortized complexity for sampling process is <tt>O(k + mn)</tt> , with n being the number of the regions of the sampled hbase table, m being the number of guide posts, and k being the size of sampled population.</p> + </div> + <div class="section"> + <h4 id="Repeatable">Repeatable</h4> + <p>Repeatable means it guarantees that the repeated sampling on the same table will resulted in the same sampled result.</p> + <p>Repeatable feature is made possible by a consistent-hashing process applied on the binary representation of the start rowkey of each guideposts of each region when sampling process is in action. During hashing, a FNV1 implementation with lazy mod method is adopted by default. see <a class="externalLink" href="http://www.isthe.com/chongo/tech/comp/fnv/">FNV1</a>.</p> + </div> + </div> +</div> +<div class="section"> + <h2 id="Examples">Examples</h2> + <p>For example, to tablesample a table , you would execute the following command. Please note, the sampling rate is a numeric value between 0 and 100, inclusive.</p> + <div class="source"> + <pre>select * from PERSON TABLESAMPLE(12.08); +</pre> + </div> + <p>Some more examples:</p> + <div class="source"> + <pre>select * from PERSON TABLESAMPLE (12.08) where ADDRESS = 'CA' OR name>'aaa'; +select count( * ) from PERSON TABLESAMPLE (12.08) LIMIT 2 +select count( * ) from (select NAME from PERSON TABLESAMPLE (49) limit 20) +select * from (select /+NO_INDEX/ * from PERSON tablesample (10) where Name > 'tina10') where ADDRESS = 'CA' +select * from PERSON1, PERSON2 tablesample (70) where PERSON1.Name = PERSON2.NAME +select /+NO_INDEX/ count( * ) from PERSON tableSample (19), US_POPULATION tableSample (28) where PERSON.Name > US_POPULATION.STATE +upsert into personbig(ID, ADDRESS) select id, address from personbig tablesample (1); +</pre> + </div> + <p>To use it in aggregation:</p> + <div class="source"> + <pre>select count( * ) from PERSON TABLESAMPLE (49) LIMIT 2 +select count( * ) from (select NAME from PERSON TABLESAMPLE (49) limit 20) +</pre> + </div> + <p>To explain the query:</p> + <div class="source"> + <pre>explain select count( * ) from PERSON TABLESAMPLE (49) LIMIT 2) +</pre> + </div> +</div> +<div class="section"> + <h2 id="Tuning">Tuning</h2> + <ul> + <li> <p>Due to sampling process, the TABLESAMPLE clause should be used with some caution. For example, a join of two tables is likely to return a match for each row in both tables; however, when tablesampling process is applied on either one or both of the two tables, the joining may not match the expectation when without sampling.</p></li> + <li> <p>The stats needs to be collected in order to achieve the best sampling accuracy. To turn on statistics collection, refer to <a href="update_statistics.html">Statistics Collection</a>.</p></li> + </ul> + <p></p> + <div> + <pre> ALTER TABLE my_table SET GUIDE_POSTS_WIDTH = 10000000 </pre> + </div> * A denser guide post setting improves the sampling accuracy, but reduces the performance at the same time. A comparison is experimented as fig below: + <p><img src="images/sampling_accuracy_perf.jpg" alt="table samping performance" /></p> +</div> + </div> + </div> + </div> + </div> + + </div><!-- /container --> + + <!-- Footer + ================================================== --> + <footer class="well"> + <div class="container"> + <div class="row"> + <div class="span2 bottom-nav"> + <ul class="nav nav-list"> + <li class="nav-header">About</li> + <li > + <a href="index.html" title="Overview">Overview</a> + </li> + <li > + <a href="who_is_using.html" title="Who is Using">Who is Using</a> + </li> + <li > + <a href="recent.html" title="Recent Improvements">Recent Improvements</a> + </li> + <li > + <a href="roadmap.html" title="Roadmap">Roadmap</a> + </li> + <li > + <a href="news.html" title="News">News</a> + </li> + <li > + <a href="performance.html" title="Performance">Performance</a> + </li> + <li > + <a href="team.html" title="Team">Team</a> + </li> + <li > + <a href="resources.html" title="Presentations">Presentations</a> + </li> + <li > + <a href="mailing_list.html" title="Mailing Lists">Mailing Lists</a> + </li> + <li > + <a href="source.html" title="Source Repository">Source Repository</a> + </li> + <li > + <a href="issues.html" title="Issue Tracking">Issue Tracking</a> + </li> + <li > + <a href="download.html" title="Download">Download</a> + </li> + <li > + <a href="installation.html" title="Installation">Installation</a> + </li> + <li > + <a href="http:divider" title=""></a> + </li> + <li > + <a href="contributing.html" title="How to Contribute">How to Contribute</a> + </li> + <li > + <a href="develop.html" title="How to Develop">How to Develop</a> + </li> + <li > + <a href="building_website.html" title="How to Update Website">How to Update Website</a> + </li> + <li > + <a href="release.html" title="How to Release">How to Release</a> + </li> + <li > + <a href="http:divider" title=""></a> + </li> + <li > + <a href="http://www.apache.org/licenses/" title="License" class="externalLink">License</a> + </li> + </ul> + </div> + <div class="span2 bottom-nav"> + <ul class="nav nav-list"> + <li class="nav-header">Using</li> + <li > + <a href="faq.html" title="F.A.Q.">F.A.Q.</a> + </li> + <li > + <a href="Phoenix-in-15-minutes-or-less.html" title="Quick Start">Quick Start</a> + </li> + <li > + <a href="building.html" title="Building">Building</a> + </li> + <li > + <a href="tuning_guide.html" title="Tuning">Tuning</a> + </li> + <li > + <a href="tuning.html" title="Configuration">Configuration</a> + </li> + <li > + <a href="upgrading.html" title="Backward Compatibility">Backward Compatibility</a> + </li> + <li > + <a href="release_notes.html" title="Release Notes">Release Notes</a> + </li> + <li > + <a href="pherf.html" title="Performance Testing">Performance Testing</a> + </li> + <li > + <a href="http:divider" title=""></a> + </li> + <li > + <a href="phoenix_spark.html" title="Apache Spark Integration">Apache Spark Integration</a> + </li> + <li > + <a href="hive_storage_handler.html" title="Phoenix Storage Handler for Apache Hive">Phoenix Storage Handler for Apache Hive</a> + </li> + <li > + <a href="pig_integration.html" title="Apache Pig Integration">Apache Pig Integration</a> + </li> + <li > + <a href="phoenix_mr.html" title="Map Reduce Integration">Map Reduce Integration</a> + </li> + <li > + <a href="flume.html" title="Apache Flume Plugin">Apache Flume Plugin</a> + </li> + <li > + <a href="kafka.html" title="Apache Kafka Plugin">Apache Kafka Plugin</a> + </li> + </ul> + </div> + <div class="span2 bottom-nav"> + <ul class="nav nav-list"> + <li class="nav-header">Features</li> + <li > + <a href="transactions.html" title="Transactions">Transactions</a> + </li> + <li > + <a href="udf.html" title="User-defined Functions">User-defined Functions</a> + </li> + <li > + <a href="http:divider" title=""></a> + </li> + <li > + <a href="secondary_indexing.html" title="Secondary Indexes">Secondary Indexes</a> + </li> + <li > + <a href="columnencoding.html" title="Storage Formats">Storage Formats</a> + </li> + <li > + <a href="atomic_upsert.html" title="Atomic Upsert">Atomic Upsert</a> + </li> + <li > + <a href="namspace_mapping.html" title="Namespace Mapping">Namespace Mapping</a> + </li> + <li > + <a href="update_statistics.html" title="Statistics Collection">Statistics Collection</a> + </li> + <li > + <a href="rowtimestamp.html" title="Row Timestamp Column">Row Timestamp Column</a> + </li> + <li > + <a href="paged.html" title="Paged Queries">Paged Queries</a> + </li> + <li > + <a href="salted.html" title="Salted Tables">Salted Tables</a> + </li> + <li > + <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> + </li> + <li class="active"> + <a href="#" title="Table Sampling">Table Sampling</a> + </li> + <li > + <a href="http:divider" title=""></a> + </li> + <li > + <a href="views.html" title="Views">Views</a> + </li> + <li > + <a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a> + </li> + <li > + <a href="dynamic_columns.html" title="Dynamic Columns">Dynamic Columns</a> + </li> + <li > + <a href="http:divider" title=""></a> + </li> + <li > + <a href="bulk_dataload.html" title="Bulk Loading">Bulk Loading</a> + </li> + <li > + <a href="server.html" title="Query Server">Query Server</a> + </li> + <li > + <a href="metrics.html" title="Metrics">Metrics</a> + </li> + <li > + <a href="tracing.html" title="Tracing">Tracing</a> + </li> + <li > + <a href="cursors.html" title="Cursor">Cursor</a> + </li> + </ul> + </div> + <div class="span3 bottom-nav"> + <ul class="nav nav-list"> + <li class="nav-header">Reference</li> + <li > + <a href="language/index.html" title="Grammar">Grammar</a> + </li> + <li > + <a href="language/functions.html" title="Functions">Functions</a> + </li> + <li > + <a href="language/datatypes.html" title="Datatypes">Datatypes</a> + </li> + <li > + <a href="array_type.html" title="ARRAY type">ARRAY type</a> + </li> + <li > + <a href="http:divider" title=""></a> + </li> + <li > + <a href="sequences.html" title="Sequences">Sequences</a> + </li> + <li > + <a href="joins.html" title="Joins">Joins</a> + </li> + <li > + <a href="subqueries.html" title="Subqueries">Subqueries</a> + </li> + </ul> + </div> + <div class="span3 bottom-description"> + <form action="http://search-hadoop.com/?" method="get"><input value="Phoenix" name="fc_project" type="hidden"><input placeholder="Search Phoenix…" required="required" style="width:170px;" size="18" name="q" id="query" type="search"></form> + </div> + </div> + </div> + </footer> + + <div class="container subfooter"> + <div class="row"> + <div class="span12"> + <p class="pull-right"><a href="#">Back to top</a></p> + <p class="copyright">Copyright ©2017 <a href="http://www.apache.org">Apache Software Foundation</a>. All Rights Reserved.</p> + </div> + </div> + </div> + + <!-- Le javascript + ================================================== --> + <!-- Placed at the end of the document so the pages load faster --> + <script src="//ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script> + + <script src="//netdna.bootstrapcdn.com/twitter-bootstrap/2.3.2/js/bootstrap.min.js"></script> + <script src="./js/lightbox.js"></script> + <script src="./js/jquery.smooth-scroll.min.js"></script> + <!-- back button support for smooth scroll --> + <script src="./js/jquery.ba-bbq.min.js"></script> + <script src="//yandex.st/highlightjs/7.5/highlight.min.js"></script> + + <script src="./js/reflow-skin.js"></script> + + </body> +</html>
Modified: phoenix/site/publish/team.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/team.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/team.html (original) +++ phoenix/site/publish/team.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-24 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -528,6 +529,9 @@ <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/publish/tracing.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/tracing.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/tracing.html (original) +++ phoenix/site/publish/tracing.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -523,6 +524,9 @@ Connection conn = DriverManager.getConne <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/publish/transactions.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/transactions.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/transactions.html (original) +++ phoenix/site/publish/transactions.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -420,6 +421,9 @@ DELETE FROM my_other_table WHERE k=2; <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/publish/tuning.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/tuning.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/tuning.html (original) +++ phoenix/site/publish/tuning.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -594,6 +595,9 @@ <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/publish/tuning_guide.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/tuning_guide.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/tuning_guide.html (original) +++ phoenix/site/publish/tuning_guide.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -592,6 +593,9 @@ <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/publish/udf.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/udf.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/udf.html (original) +++ phoenix/site/publish/udf.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -473,6 +474,9 @@ Connection conn = DriverManager.getConne <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/publish/update_statistics.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/update_statistics.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/update_statistics.html (original) +++ phoenix/site/publish/update_statistics.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -394,6 +395,9 @@ <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/publish/upgrading.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/upgrading.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/upgrading.html (original) +++ phoenix/site/publish/upgrading.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -355,6 +356,9 @@ <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/publish/views.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/views.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/views.html (original) +++ phoenix/site/publish/views.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li class="active"><a href="" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -384,6 +385,9 @@ VALUES('John Doe', CURRENT_DATE(), NEXT <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li class="active"> Modified: phoenix/site/publish/who_is_using.html URL: http://svn.apache.org/viewvc/phoenix/site/publish/who_is_using.html?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/publish/who_is_using.html (original) +++ phoenix/site/publish/who_is_using.html Wed Oct 4 07:09:48 2017 @@ -1,7 +1,7 @@ <!DOCTYPE html> <!-- - Generated by Apache Maven Doxia at 2017-09-15 + Generated by Apache Maven Doxia at 2017-10-04 Rendered using Reflow Maven Skin 1.1.0 (http://andriusvelykis.github.io/reflow-maven-skin) --> <html xml:lang="en" lang="en"> @@ -115,6 +115,7 @@ <li ><a href="paged.html" title="Paged Queries">Paged Queries</a></li> <li ><a href="salted.html" title="Salted Tables">Salted Tables</a></li> <li ><a href="skip_scan.html" title="Skip Scan">Skip Scan</a></li> + <li ><a href="tablesample.html" title="Table Sampling">Table Sampling</a></li> <li class="divider"/> <li ><a href="views.html" title="Views">Views</a></li> <li ><a href="multi-tenancy.html" title="Multi tenancy">Multi tenancy</a></li> @@ -437,6 +438,9 @@ <a href="skip_scan.html" title="Skip Scan">Skip Scan</a> </li> <li > + <a href="tablesample.html" title="Table Sampling">Table Sampling</a> + </li> + <li > <a href="http:divider" title=""></a> </li> <li > Modified: phoenix/site/source/src/site/markdown/recent.md URL: http://svn.apache.org/viewvc/phoenix/site/source/src/site/markdown/recent.md?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/source/src/site/markdown/recent.md (original) +++ phoenix/site/source/src/site/markdown/recent.md Wed Oct 4 07:09:48 2017 @@ -2,6 +2,7 @@ As items are implemented from our road map, they are moved here to track the progress we've made: +1. **[Table Sampling](tablesample.html)**. Support the <code>TABLESAMPLE</code> clause by implementing a filter that uses the guideposts established by stats gathering to only return a percentage of the rows. **Available in our 4.12 release** 1. **[Reduce on disk storage](https://phoenix.apache.org/columnencoding.html)**. Reduce on disk storage to improve performance by a) packing all values into a single cell per column family and b) provide an indirection between the column name and the column qualifier. **Available in our 4.10 release** 1. **[Atomic update](https://phoenix.apache.org/atomic_upsert.html)**. Atomic update is now possible in the UPSERT VALUES statement in support of counters and other use cases. **Available in our 4.9 release** 6. **[DEFAULT declaration](https://phoenix.apache.org/language/index.html#column_def)**. When defining a column it is now possible to provide a DEFAULT declaration for the initial value. **Available in our 4.9 release** Modified: phoenix/site/source/src/site/markdown/roadmap.md URL: http://svn.apache.org/viewvc/phoenix/site/source/src/site/markdown/roadmap.md?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/source/src/site/markdown/roadmap.md (original) +++ phoenix/site/source/src/site/markdown/roadmap.md Wed Oct 4 07:09:48 2017 @@ -14,7 +14,6 @@ Our roadmap is driven by our user commun 7. **[STRUCT type](https://issues.apache.org/jira/browse/PHOENIX-477)**. Allow declarations of multiple fields of different data types that would be packed into a single cell to reduce the per column storage overhead. 17. **[OLAP Extensions](https://issues.apache.org/jira/browse/PHOENIX-154)**. Support the `WINDOW`, `PARTITION OVER`, `RANK`, and other SQL-92 extensions. 16. **[Multi-version Row Queries](https://issues.apache.org/jira/browse/PHOENIX-590)**. Expose the time dimension of rows through a built-in function to allow aggregation and trending over multiple row versions. -18. **[Table Sampling](https://issues.apache.org/jira/browse/PHOENIX-153)**. Support the <code>TABLESAMPLE</code> clause by implementing a filter that uses the guideposts established by stats gathering to only return n rows per region. 14. **Security Features**. A number of existing HBase security features in 0.94 could be leverage and new security features being added to 0.98 could be leveraged in the future. * **[Support GRANT and REVOKE](https://issues.apache.org/jira/browse/PHOENIX-672)**. Support the standard GRANT and REVOKE SQL commands through an HBase AccessController. * **[Surface support for encryption](https://issues.apache.org/jira/browse/PHOENIX-673)**. Surface specification of what should be encrypted now that HBase supports transparent encryption. Added: phoenix/site/source/src/site/markdown/tablesample.md URL: http://svn.apache.org/viewvc/phoenix/site/source/src/site/markdown/tablesample.md?rev=1811040&view=auto ============================================================================== --- phoenix/site/source/src/site/markdown/tablesample.md (added) +++ phoenix/site/source/src/site/markdown/tablesample.md Wed Oct 4 07:09:48 2017 @@ -0,0 +1,80 @@ +# Table Sampliing + +To support table sampling, similar to the Postgresql and T-SQL syntax, a TABLESAMPLE clause has been +encorporated into the table aliases ref as of Phoenix 4.12. The general syntax is described +[here](language/index.html#aliased_table_ref). This feature limits the number of rows returned from +a table to a PERCENT of rows. See [PHOENIX-153](https://issues.apache.org/jira/browse/PHOENIX-153) for +implementation details. + +This feature is implmented with a Bernoulli trial, a consistent-hashing based tablesampler, to achieve a +Bernoulli sampling on a given population (rows). When executed, given a sampling rate, it leverages Phoenix's +stats as well HBase's region distribution to perform a table sampling. + +As part of the [statistics collection](update_statistics.html), a guide post (a row referrence) is created for +rows at equidistant bytes from each other. When sampling is required, a Bernoulli trial process is applied +repeatedly on each guide post on each region with a probability that porportional to the sampling rate. A +included guide post will result in all the rows in between this and next guide post being included into +sample population. + +#### Performance +Sampling on a table with a sampling rate = 100% will cost same amount of computational resource as a query without sampling. +This resouces consumption quicky fall off when a smaller sampling rate is choosing. In general, +the amortized complexity for sampling process is `O(k + mn)` , with n being the number of the regions of the sampled hbase table, +m being the number of guide posts, and k being the size of sampled population. + + +#### Repeatable +Repeatable means it guarantees that the repeated sampling on the same table will resulted in the same sampled result. + +Repeatable feature is made possible by a consistent-hashing process applied on the binary representation of the +start rowkey of each guideposts of each region when sampling process is in action. During hashing, +a FNV1 implementation with lazy mod method is adopted by default. see [FNV1](http://www.isthe.com/chongo/tech/comp/fnv/). + + +## Examples + +For example, to tablesample a table , you would execute the following command. +Please note, the sampling rate is a numeric value between 0 and 100, inclusive. + + select * from PERSON TABLESAMPLE(12.08); + +Some more examples: + + select * from PERSON TABLESAMPLE (12.08) where ADDRESS = 'CA' OR name>'aaa'; + select count( * ) from PERSON TABLESAMPLE (12.08) LIMIT 2 + select count( * ) from (select NAME from PERSON TABLESAMPLE (49) limit 20) + select * from (select /+NO_INDEX/ * from PERSON tablesample (10) where Name > 'tina10') where ADDRESS = 'CA' + select * from PERSON1, PERSON2 tablesample (70) where PERSON1.Name = PERSON2.NAME + select /+NO_INDEX/ count( * ) from PERSON tableSample (19), US_POPULATION tableSample (28) where PERSON.Name > US_POPULATION.STATE + upsert into personbig(ID, ADDRESS) select id, address from personbig tablesample (1); + +To use it in aggregation: + + select count( * ) from PERSON TABLESAMPLE (49) LIMIT 2 + select count( * ) from (select NAME from PERSON TABLESAMPLE (49) limit 20) + +To explain the query: + + explain select count( * ) from PERSON TABLESAMPLE (49) LIMIT 2) + + + +## Tuning + +* Due to sampling process, the TABLESAMPLE clause should be used with some caution. +For example, a join of two tables is likely to return a match for each row in both tables; +however, when tablesampling process is applied on either one or both of the two tables, the joining +may not match the expectation when without sampling. + +* The stats needs to be collected in order to achieve the best sampling accuracy. To turn on statistics collection, refer to [Statistics Collection](update_statistics.html). + + <pre> + ALTER TABLE my_table SET GUIDE_POSTS_WIDTH = 10000000 + </pre> +* A denser guide post setting improves the sampling accuracy, but reduces the performance at the same time. A comparison +is experimented as fig below: + + + + + Added: phoenix/site/source/src/site/resources/images/sampling_accuracy_perf.jpg URL: http://svn.apache.org/viewvc/phoenix/site/source/src/site/resources/images/sampling_accuracy_perf.jpg?rev=1811040&view=auto ============================================================================== Binary file - no diff available. Propchange: phoenix/site/source/src/site/resources/images/sampling_accuracy_perf.jpg ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: phoenix/site/source/src/site/site.xml URL: http://svn.apache.org/viewvc/phoenix/site/source/src/site/site.xml?rev=1811040&r1=1811039&r2=1811040&view=diff ============================================================================== --- phoenix/site/source/src/site/site.xml (original) +++ phoenix/site/source/src/site/site.xml Wed Oct 4 07:09:48 2017 @@ -111,6 +111,7 @@ <item href="paged.html" name="Paged Queries"/> <item href="salted.html" name="Salted Tables"/> <item href="skip_scan.html" name="Skip Scan"/> + <item href="tablesample.html" name="Table Sampling"/> <item href="http:divider" name=""/> <item href="views.html" name="Views"/> <item href="multi-tenancy.html" name="Multi tenancy"/>
