http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_double.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_double.xml b/docs/topics/impala_double.xml new file mode 100644 index 0000000..f1d1756 --- /dev/null +++ b/docs/topics/impala_double.xml @@ -0,0 +1,100 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="double"> + + <title>DOUBLE Data Type</title> + <titlealts><navtitle>DOUBLE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Data Types"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Schemas"/> + </metadata> + </prolog> + + <conbody> + + <p> + A double precision floating-point data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER + TABLE</codeph> statements. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + + <p> + In the column definition of a <codeph>CREATE TABLE</codeph> statement: + </p> + +<codeblock><varname>column_name</varname> DOUBLE</codeblock> + + <p> + <b>Range:</b> 4.94065645841246544e-324d .. 1.79769313486231570e+308, positive or negative + </p> + + <p> + <b>Precision:</b> 15 to 17 significant digits, depending on usage. The number of significant digits does + not depend on the position of the decimal point. + </p> + + <p> + <b>Representation:</b> The values are stored in 8 bytes, using + <xref href="https://en.wikipedia.org/wiki/Double-precision_floating-point_format" scope="external" format="html">IEEE 754 Double Precision Binary Floating Point</xref> format. + </p> + + <p> + <b>Conversions:</b> Impala does not automatically convert <codeph>DOUBLE</codeph> to any other type. 
You can + use <codeph>CAST()</codeph> to convert <codeph>DOUBLE</codeph> values to <codeph>FLOAT</codeph>, + <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, <codeph>INT</codeph>, <codeph>BIGINT</codeph>, + <codeph>STRING</codeph>, <codeph>TIMESTAMP</codeph>, or <codeph>BOOLEAN</codeph>. You can use exponential + notation in <codeph>DOUBLE</codeph> literals or when casting from <codeph>STRING</codeph>, for example + <codeph>1.0e6</codeph> to represent one million. + <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/> + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + The data type <codeph>REAL</codeph> is an alias for <codeph>DOUBLE</codeph>. + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<codeblock>CREATE TABLE t1 (x DOUBLE); +SELECT CAST(1000.5 AS DOUBLE); +</codeblock> + + <p conref="../shared/impala_common.xml#common/partitioning_imprecise"/> + + <p conref="../shared/impala_common.xml#common/hbase_ok"/> + + <p conref="../shared/impala_common.xml#common/parquet_ok"/> + + <p conref="../shared/impala_common.xml#common/text_bulky"/> + +<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> --> + + <p conref="../shared/impala_common.xml#common/internals_8_bytes"/> + +<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> --> + + <p conref="../shared/impala_common.xml#common/column_stats_constant"/> + + <p conref="../shared/impala_common.xml#common/restrictions_blurb"/> + +<!-- This conref appears under SUM(), AVG(), FLOAT, and DOUBLE topics. 
--> + + <p conref="../shared/impala_common.xml#common/sum_double"/> + + <p conref="../shared/impala_common.xml#common/float_double_decimal_caveat"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_math_functions.xml#math_functions"/>, + <xref href="impala_float.xml#float"/> + </p> + </conbody> +</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_database.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_drop_database.xml b/docs/topics/impala_drop_database.xml new file mode 100644 index 0000000..c6a1b64 --- /dev/null +++ b/docs/topics/impala_drop_database.xml @@ -0,0 +1,124 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="drop_database"> + + <title>DROP DATABASE Statement</title> + <titlealts><navtitle>DROP DATABASE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Databases"/> + <data name="Category" value="DDL"/> + <data name="Category" value="Schemas"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">DROP DATABASE statement</indexterm> + Removes a database from the system. The physical operations involve removing the metadata for the database + from the metastore, and deleting the corresponding <codeph>*.db</codeph> directory from HDFS. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>DROP (DATABASE|SCHEMA) [IF EXISTS] <varname>database_name</varname> <ph rev="2.3.0">[RESTRICT | CASCADE]</ph>;</codeblock> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + By default, the database must be empty before it can be dropped, to avoid losing any data. + </p> + + <p rev="2.3.0"> + In CDH 5.5 / Impala 2.3 and higher, you can include the <codeph>CASCADE</codeph> + clause to make Impala drop all tables and other objects in the database before dropping the database itself. + The <codeph>RESTRICT</codeph> clause enforces the original requirement that the database be empty + before being dropped. 
Because the <codeph>RESTRICT</codeph> behavior is still the default, this + clause is optional. + </p> + + <p rev="2.3.0"> + The automatic dropping resulting from the <codeph>CASCADE</codeph> clause follows the same rules as the + corresponding <codeph>DROP TABLE</codeph>, <codeph>DROP VIEW</codeph>, and <codeph>DROP FUNCTION</codeph> statements. + In particular, the HDFS directories and data files for any external tables are left behind when the + tables are removed. + </p> + + <p> + When you do not use the <codeph>CASCADE</codeph> clause, drop or move all the objects inside the database manually + before dropping the database itself: + </p> + + <ul> + <li> + <p> + Use the <codeph>SHOW TABLES</codeph> statement to locate all tables and views in the database, + and issue <codeph>DROP TABLE</codeph> and <codeph>DROP VIEW</codeph> statements to remove them all. + </p> + </li> + <li> + <p> + Use the <codeph>SHOW FUNCTIONS</codeph> and <codeph>SHOW AGGREGATE FUNCTIONS</codeph> statements + to locate all user-defined functions in the database, and issue <codeph>DROP FUNCTION</codeph> + and <codeph>DROP AGGREGATE FUNCTION</codeph> statements to remove them all. + </p> + </li> + <li> + <p> + To keep tables or views contained by a database while removing the database itself, use + <codeph>ALTER TABLE</codeph> and <codeph>ALTER VIEW</codeph> to move the relevant + objects to a different database before dropping the original database. + </p> + </li> + </ul> + + <p> + You cannot drop the current database, that is, the database your session connected to + either through the <codeph>USE</codeph> statement or the <codeph>-d</codeph> option of <cmdname>impala-shell</cmdname>. + Issue a <codeph>USE</codeph> statement to switch to a different database first. + Because the <codeph>default</codeph> database is always available, issuing + <codeph>USE default</codeph> is a convenient way to leave the current database + before dropping it. 
+ </p> + + <p conref="../shared/impala_common.xml#common/hive_blurb"/> + + <p> + When you drop a database in Impala, the database can no longer be used by Hive. + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<!-- Better to conref the same examples in both places. --> + + <p> + See <xref href="impala_create_database.xml#create_database"/> for examples covering <codeph>CREATE + DATABASE</codeph>, <codeph>USE</codeph>, and <codeph>DROP DATABASE</codeph>. + </p> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb"/> + <p rev="CDH-19187"> + The user ID that the <cmdname>impalad</cmdname> daemon runs under, + typically the <codeph>impala</codeph> user, must have write + permission for the directory associated with the database. + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <codeblock conref="../shared/impala_common.xml#common/create_drop_db_example"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_databases.xml#databases"/>, <xref href="impala_create_database.xml#create_database"/>, + <xref href="impala_use.xml#use"/>, <xref href="impala_show.xml#show_databases"/>, <xref href="impala_drop_table.xml#drop_table"/> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_function.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_drop_function.xml b/docs/topics/impala_drop_function.xml new file mode 100644 index 0000000..51a4d90 --- /dev/null +++ b/docs/topics/impala_drop_function.xml @@ -0,0 +1,60 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="1.2" id="drop_function"> + + <title>DROP FUNCTION Statement</title> + <titlealts><navtitle>DROP FUNCTION</navtitle></titlealts> + 
<prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="DDL"/> + <data name="Category" value="Impala Functions"/> + <data name="Category" value="UDFs"/> + <data name="Category" value="Schemas"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">DROP FUNCTION statement</indexterm> + Removes a user-defined function (UDF), so that it is not available for execution during Impala + <codeph>SELECT</codeph> or <codeph>INSERT</codeph> operations. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>DROP [AGGREGATE] FUNCTION [IF EXISTS] [<varname>db_name</varname>.]<varname>function_name</varname>(<varname>type</varname>[, <varname>type</varname>...])</codeblock> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + Because the same function name could be overloaded with different argument signatures, you specify the + argument types to identify the exact function to drop. + </p> + + <p conref="../shared/impala_common.xml#common/restrictions_blurb"/> + + <p conref="../shared/impala_common.xml#common/udf_persistence_restriction"/> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb"/> + <p rev="CDH-19187"> + The user ID that the <cmdname>impalad</cmdname> daemon runs under, + typically the <codeph>impala</codeph> user, does not need any + particular HDFS permissions to perform this statement. + All read and write operations are on the metastore database, + not HDFS files and directories. 
+ </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_udf.xml#udfs"/>, <xref href="impala_create_function.xml#create_function"/> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_role.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_drop_role.xml b/docs/topics/impala_drop_role.xml new file mode 100644 index 0000000..35d2157 --- /dev/null +++ b/docs/topics/impala_drop_role.xml @@ -0,0 +1,67 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="1.4.0" id="drop_role"> + + <title>DROP ROLE Statement (CDH 5.2 or higher only)</title> + <titlealts><navtitle>DROP ROLE (CDH 5.2 or higher only)</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="DDL"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Sentry"/> + <data name="Category" value="Roles"/> + <!-- Consider whether to go deeper into categories like Security for the Sentry-related statements. --> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">DROP ROLE statement</indexterm> +<!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. --> + The <codeph>DROP ROLE</codeph> statement removes a role from the metastore database. Once dropped, the role + is revoked for all users to whom it was previously assigned, and all privileges granted to that role are + revoked. Queries that are already executing are not affected. Impala verifies the role information + approximately every 60 seconds, so the effects of <codeph>DROP ROLE</codeph> might not take effect for new + Impala queries for a brief period. 
+ </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>DROP ROLE <varname>role_name</varname> +</codeblock> + + <p conref="../shared/impala_common.xml#common/privileges_blurb"/> + + <p> + Only administrative users (initially, a predefined set of users specified in the Sentry service configuration + file) can use this statement. + </p> + + <p conref="../shared/impala_common.xml#common/compatibility_blurb"/> + + <p> + Impala makes use of any roles and privileges specified by the <codeph>GRANT</codeph> and + <codeph>REVOKE</codeph> statements in Hive, and Hive makes use of any roles and privileges specified by the + <codeph>GRANT</codeph> and <codeph>REVOKE</codeph> statements in Impala. The Impala <codeph>GRANT</codeph> + and <codeph>REVOKE</codeph> statements for privileges do not require the <codeph>ROLE</codeph> keyword to be + repeated before each role name, unlike the equivalent Hive statements. + </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_authorization.xml#authorization"/>, <xref href="impala_grant.xml#grant"/>, + <xref href="impala_revoke.xml#revoke"/>, <xref href="impala_create_role.xml#create_role"/>, + <xref href="impala_show.xml#show"/> + </p> + +<!-- To do: nail down the new SHOW syntax, e.g. SHOW ROLES, SHOW CURRENT ROLES, SHOW GROUPS. 
--> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_stats.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_drop_stats.xml b/docs/topics/impala_drop_stats.xml new file mode 100644 index 0000000..56697f4 --- /dev/null +++ b/docs/topics/impala_drop_stats.xml @@ -0,0 +1,275 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="2.1.0" id="drop_stats" xml:lang="en-US"> + + <title>DROP STATS Statement</title> + <titlealts><navtitle>DROP STATS</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="DDL"/> + <data name="Category" value="Tables"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Scalability"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">DROP STATS statement</indexterm> + Removes the specified statistics from a table or partition. The statistics were originally created by the + <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph> statement. 
+ </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock rev="2.1.0">DROP STATS [<varname>database_name</varname>.]<varname>table_name</varname> +DROP INCREMENTAL STATS [<varname>database_name</varname>.]<varname>table_name</varname> PARTITION (<varname>partition_spec</varname>) + +<varname>partition_spec</varname> ::= <varname>partition_col</varname>=<varname>constant_value</varname> +</codeblock> + + <p conref="../shared/impala_common.xml#common/incremental_partition_spec"/> + + <p> + <codeph>DROP STATS</codeph> removes all statistics from the table, whether created by <codeph>COMPUTE + STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph>. + </p> + + <p rev="2.1.0"> + <codeph>DROP INCREMENTAL STATS</codeph> only affects incremental statistics for a single partition, specified + through the <codeph>PARTITION</codeph> clause. The incremental stats are marked as outdated, so that they are + recomputed by the next <codeph>COMPUTE INCREMENTAL STATS</codeph> statement. + </p> + +<!-- To do: what release was this added in? --> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + You typically use this statement when the statistics for a table or a partition have become stale due to data + files being added to or removed from the associated HDFS data directories, whether by manual HDFS operations + or <codeph>INSERT</codeph>, <codeph>INSERT OVERWRITE</codeph>, or <codeph>LOAD DATA</codeph> statements, or + adding or dropping partitions. + </p> + + <p> + When a table or partition has no associated statistics, Impala treats it as essentially zero-sized when + constructing the execution plan for a query. In particular, the statistics influence the order in which + tables are joined in a join query. 
To ensure proper query planning and good query performance and + scalability, make sure to run <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph> on + the table or partition after removing any stale statistics. + </p> + + <p> + Dropping the statistics is not required for an unpartitioned table or a partitioned table covered by the + original type of statistics. A subsequent <codeph>COMPUTE STATS</codeph> statement replaces any existing + statistics with new ones, for all partitions, regardless of whether the old ones were outdated. Therefore, + this statement was rarely used before the introduction of incremental statistics. + </p> + + <p> + Dropping the statistics is required for a partitioned table containing incremental statistics, to make a + subsequent <codeph>COMPUTE INCREMENTAL STATS</codeph> statement rescan an existing partition. See + <xref href="impala_perf_stats.xml#perf_stats"/> for information about incremental statistics, a new feature + available in Impala 2.1.0 and higher. + </p> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb"/> + <p rev="CDH-19187"> + The user ID that the <cmdname>impalad</cmdname> daemon runs under, + typically the <codeph>impala</codeph> user, does not need any + particular HDFS permissions to perform this statement. + All read and write operations are on the metastore database, + not HDFS files and directories. + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p> + The following example shows a partitioned table that has associated statistics produced by the + <codeph>COMPUTE INCREMENTAL STATS</codeph> statement, and how the situation evolves as statistics are dropped + from specific partitions, then the entire table. + </p> + + <p> + Initially, all table and column statistics are filled in. 
+ </p> + +<!-- Note: chopped off any excess characters at position 87 and after, + to avoid weird wrapping in PDF. + Applies to any subsequent examples with output from SHOW ... STATS too. --> + +<codeblock>show table stats item_partitioned; ++-------------+-------+--------+----------+--------------+---------+------------------ +| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats ++-------------+-------+--------+----------+--------------+---------+------------------ +| Books | 1733 | 1 | 223.74KB | NOT CACHED | PARQUET | true +| Children | 1786 | 1 | 230.05KB | NOT CACHED | PARQUET | true +| Electronics | 1812 | 1 | 232.67KB | NOT CACHED | PARQUET | true +| Home | 1807 | 1 | 232.56KB | NOT CACHED | PARQUET | true +| Jewelry | 1740 | 1 | 223.72KB | NOT CACHED | PARQUET | true +| Men | 1811 | 1 | 231.25KB | NOT CACHED | PARQUET | true +| Music | 1860 | 1 | 237.90KB | NOT CACHED | PARQUET | true +| Shoes | 1835 | 1 | 234.90KB | NOT CACHED | PARQUET | true +| Sports | 1783 | 1 | 227.97KB | NOT CACHED | PARQUET | true +| Women | 1790 | 1 | 226.27KB | NOT CACHED | PARQUET | true +| Total | 17957 | 10 | 2.25MB | 0B | | ++-------------+-------+--------+----------+--------------+---------+------------------ +show column stats item_partitioned; ++------------------+-----------+------------------+--------+----------+--------------- +| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size ++------------------+-----------+------------------+--------+----------+--------------- +| i_item_sk | INT | 19443 | -1 | 4 | 4 +| i_item_id | STRING | 9025 | -1 | 16 | 16 +| i_rec_start_date | TIMESTAMP | 4 | -1 | 16 | 16 +| i_rec_end_date | TIMESTAMP | 3 | -1 | 16 | 16 +| i_item_desc | STRING | 13330 | -1 | 200 | 100.3028030395 +| i_current_price | FLOAT | 2807 | -1 | 4 | 4 +| i_wholesale_cost | FLOAT | 2105 | -1 | 4 | 4 +| i_brand_id | INT | 965 | -1 | 4 | 4 +| i_brand | STRING | 725 | -1 | 22 | 16.17760086059 +| i_class_id | INT | 16 | -1 | 4 | 4 +| 
i_class | STRING | 101 | -1 | 15 | 7.767499923706 +| i_category_id | INT | 10 | -1 | 4 | 4 +| i_manufact_id | INT | 1857 | -1 | 4 | 4 +| i_manufact | STRING | 1028 | -1 | 15 | 11.32950019836 +| i_size | STRING | 8 | -1 | 11 | 4.334599971771 +| i_formulation | STRING | 12884 | -1 | 20 | 19.97999954223 +| i_color | STRING | 92 | -1 | 10 | 5.380899906158 +| i_units | STRING | 22 | -1 | 7 | 4.186900138854 +| i_container | STRING | 2 | -1 | 7 | 6.992599964141 +| i_manager_id | INT | 105 | -1 | 4 | 4 +| i_product_name | STRING | 19094 | -1 | 25 | 18.02330017089 +| i_category | STRING | 10 | 0 | -1 | -1 ++------------------+-----------+------------------+--------+----------+--------------- +</codeblock> + + <p> + To remove statistics for particular partitions, use the <codeph>DROP INCREMENTAL STATS</codeph> statement. + After removing statistics for two partitions, the table-level statistics reflect that change in the + <codeph>#Rows</codeph> and <codeph>Incremental stats</codeph> fields. The counts, maximums, and averages of + the column-level statistics are unaffected. + </p> + + <note> + (It is possible that the row count might be preserved in future after a <codeph>DROP INCREMENTAL + STATS</codeph> statement. Check the resolution of the issue + <xref href="https://issues.cloudera.org/browse/IMPALA-1615" scope="external" format="html">IMPALA-1615</xref>.) 
+ </note> + +<codeblock>drop incremental stats item_partitioned partition (i_category='Sports'); +drop incremental stats item_partitioned partition (i_category='Electronics'); + +show table stats item_partitioned ++-------------+-------+--------+----------+--------------+---------+------------------ +| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats ++-------------+-------+--------+----------+--------------+---------+------------------ +| Books | 1733 | 1 | 223.74KB | NOT CACHED | PARQUET | true +| Children | 1786 | 1 | 230.05KB | NOT CACHED | PARQUET | true +| Electronics | -1 | 1 | 232.67KB | NOT CACHED | PARQUET | false +| Home | 1807 | 1 | 232.56KB | NOT CACHED | PARQUET | true +| Jewelry | 1740 | 1 | 223.72KB | NOT CACHED | PARQUET | true +| Men | 1811 | 1 | 231.25KB | NOT CACHED | PARQUET | true +| Music | 1860 | 1 | 237.90KB | NOT CACHED | PARQUET | true +| Shoes | 1835 | 1 | 234.90KB | NOT CACHED | PARQUET | true +| Sports | -1 | 1 | 227.97KB | NOT CACHED | PARQUET | false +| Women | 1790 | 1 | 226.27KB | NOT CACHED | PARQUET | true +| Total | 17957 | 10 | 2.25MB | 0B | | ++-------------+-------+--------+----------+--------------+---------+------------------ +show column stats item_partitioned ++------------------+-----------+------------------+--------+----------+--------------- +| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size ++------------------+-----------+------------------+--------+----------+--------------- +| i_item_sk | INT | 19443 | -1 | 4 | 4 +| i_item_id | STRING | 9025 | -1 | 16 | 16 +| i_rec_start_date | TIMESTAMP | 4 | -1 | 16 | 16 +| i_rec_end_date | TIMESTAMP | 3 | -1 | 16 | 16 +| i_item_desc | STRING | 13330 | -1 | 200 | 100.3028030395 +| i_current_price | FLOAT | 2807 | -1 | 4 | 4 +| i_wholesale_cost | FLOAT | 2105 | -1 | 4 | 4 +| i_brand_id | INT | 965 | -1 | 4 | 4 +| i_brand | STRING | 725 | -1 | 22 | 16.17760086059 +| i_class_id | INT | 16 | -1 | 4 | 4 +| i_class | STRING | 101 | -1 | 15 
| 7.767499923706 +| i_category_id | INT | 10 | -1 | 4 | 4 +| i_manufact_id | INT | 1857 | -1 | 4 | 4 +| i_manufact | STRING | 1028 | -1 | 15 | 11.32950019836 +| i_size | STRING | 8 | -1 | 11 | 4.334599971771 +| i_formulation | STRING | 12884 | -1 | 20 | 19.97999954223 +| i_color | STRING | 92 | -1 | 10 | 5.380899906158 +| i_units | STRING | 22 | -1 | 7 | 4.186900138854 +| i_container | STRING | 2 | -1 | 7 | 6.992599964141 +| i_manager_id | INT | 105 | -1 | 4 | 4 +| i_product_name | STRING | 19094 | -1 | 25 | 18.02330017089 +| i_category | STRING | 10 | 0 | -1 | -1 ++------------------+-----------+------------------+--------+----------+--------------- +</codeblock> + <p> + To remove all statistics from the table, whether produced by <codeph>COMPUTE STATS</codeph> or + <codeph>COMPUTE INCREMENTAL STATS</codeph>, use the <codeph>DROP STATS</codeph> statement without the + <codeph>INCREMENTAL</codeph> clause. Now, both table-level and column-level statistics are reset. + </p> + +<codeblock>drop stats item_partitioned; + +show table stats item_partitioned ++-------------+-------+--------+----------+--------------+---------+------------------ +| i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats ++-------------+-------+--------+----------+--------------+---------+------------------ +| Books | -1 | 1 | 223.74KB | NOT CACHED | PARQUET | false +| Children | -1 | 1 | 230.05KB | NOT CACHED | PARQUET | false +| Electronics | -1 | 1 | 232.67KB | NOT CACHED | PARQUET | false +| Home | -1 | 1 | 232.56KB | NOT CACHED | PARQUET | false +| Jewelry | -1 | 1 | 223.72KB | NOT CACHED | PARQUET | false +| Men | -1 | 1 | 231.25KB | NOT CACHED | PARQUET | false +| Music | -1 | 1 | 237.90KB | NOT CACHED | PARQUET | false +| Shoes | -1 | 1 | 234.90KB | NOT CACHED | PARQUET | false +| Sports | -1 | 1 | 227.97KB | NOT CACHED | PARQUET | false +| Women | -1 | 1 | 226.27KB | NOT CACHED | PARQUET | false +| Total | -1 | 10 | 2.25MB | 0B | | 
++-------------+-------+--------+----------+--------------+---------+------------------ +show column stats item_partitioned ++------------------+-----------+------------------+--------+----------+----------+ +| Column | Type | #Distinct Values | #Nulls | Max Size | Avg Size | ++------------------+-----------+------------------+--------+----------+----------+ +| i_item_sk | INT | -1 | -1 | 4 | 4 | +| i_item_id | STRING | -1 | -1 | -1 | -1 | +| i_rec_start_date | TIMESTAMP | -1 | -1 | 16 | 16 | +| i_rec_end_date | TIMESTAMP | -1 | -1 | 16 | 16 | +| i_item_desc | STRING | -1 | -1 | -1 | -1 | +| i_current_price | FLOAT | -1 | -1 | 4 | 4 | +| i_wholesale_cost | FLOAT | -1 | -1 | 4 | 4 | +| i_brand_id | INT | -1 | -1 | 4 | 4 | +| i_brand | STRING | -1 | -1 | -1 | -1 | +| i_class_id | INT | -1 | -1 | 4 | 4 | +| i_class | STRING | -1 | -1 | -1 | -1 | +| i_category_id | INT | -1 | -1 | 4 | 4 | +| i_manufact_id | INT | -1 | -1 | 4 | 4 | +| i_manufact | STRING | -1 | -1 | -1 | -1 | +| i_size | STRING | -1 | -1 | -1 | -1 | +| i_formulation | STRING | -1 | -1 | -1 | -1 | +| i_color | STRING | -1 | -1 | -1 | -1 | +| i_units | STRING | -1 | -1 | -1 | -1 | +| i_container | STRING | -1 | -1 | -1 | -1 | +| i_manager_id | INT | -1 | -1 | 4 | 4 | +| i_product_name | STRING | -1 | -1 | -1 | -1 | +| i_category | STRING | 10 | 0 | -1 | -1 | ++------------------+-----------+------------------+--------+----------+----------+ +</codeblock> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_compute_stats.xml#compute_stats"/>, <xref href="impala_show.xml#show_table_stats"/>, + <xref href="impala_show.xml#show_column_stats"/>, <xref href="impala_perf_stats.xml#perf_stats"/> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_table.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_drop_table.xml 
b/docs/topics/impala_drop_table.xml new file mode 100644 index 0000000..33cb726 --- /dev/null +++ b/docs/topics/impala_drop_table.xml @@ -0,0 +1,142 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="drop_table"> + + <title>DROP TABLE Statement</title> + <titlealts><navtitle>DROP TABLE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="DDL"/> + <data name="Category" value="Tables"/> + <data name="Category" value="Schemas"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">DROP TABLE statement</indexterm> + Removes an Impala table. Also removes the underlying HDFS data files for internal tables, although not for + external tables. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>DROP TABLE [IF EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> <ph rev="2.3.0">[PURGE]</ph></codeblock> + + <p> + <b>IF EXISTS clause:</b> + </p> + + <p> + The optional <codeph>IF EXISTS</codeph> clause makes the statement succeed whether or not the table exists. + If the table does exist, it is dropped; if it does not exist, the statement has no effect. This capability is + useful in standardized setup scripts that remove existing schema objects and create new ones. By using some + combination of <codeph>IF EXISTS</codeph> for the <codeph>DROP</codeph> statements and <codeph>IF NOT + EXISTS</codeph> clauses for the <codeph>CREATE</codeph> statements, the script can run successfully the first + time you run it (when the objects do not exist yet) and subsequent times (when some or all of the objects do + already exist). 
+ </p> + + <p rev="2.3.0"> + <b>PURGE clause:</b> + </p> + + <p rev="2.3.0"> + The optional <codeph>PURGE</codeph> keyword, available in CDH 5.5 / Impala 2.3 and higher, + causes Impala to remove the associated HDFS data files + immediately, rather than going through the HDFS trashcan mechanism. Use this keyword when dropping + a table if it is crucial to remove the data as quickly as possible to free up space, or if there is + a problem with the trashcan, such as the trashcan not being configured or being in a different + HDFS encryption zone than the data files. + </p> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + By default, Impala removes the associated HDFS directory and data files for the table. If you issue a + <codeph>DROP TABLE</codeph> and the data files are not deleted, it might be for the following reasons: + </p> + + <ul> + <li> + If the table was created with the + <codeph><xref href="impala_tables.xml#external_tables">EXTERNAL</xref></codeph> clause, Impala leaves all + files and directories untouched. Use external tables when the data is under the control of other Hadoop + components, and Impala is only used to query the data files from their original locations. + </li> + + <li> + Impala might leave the data files behind unintentionally, if there is no HDFS location available to hold + the HDFS trashcan for the <codeph>impala</codeph> user. See + <xref href="impala_prereqs.xml#prereqs_account"/> for the procedure to set up the required HDFS home + directory. + </li> + </ul> + + <p> + Make sure that you are in the correct database before dropping a table, either by issuing a + <codeph>USE</codeph> statement first or by using a fully qualified name + <codeph><varname>db_name</varname>.<varname>table_name</varname></codeph>. 
+ </p> + + <p> + If you intend to issue a <codeph>DROP DATABASE</codeph> statement, first issue <codeph>DROP TABLE</codeph> + statements to remove all the tables in that database. + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<codeblock>create database temporary; +use temporary; +create table unimportant (x int); +create table trivial (s string); +-- Drop a table in the current database. +drop table unimportant; +-- Switch to a different database. +use default; +-- To drop a table in a different database... +drop table trivial; +<i>ERROR: AnalysisException: Table does not exist: default.trivial</i> +-- ...use a fully qualified name. +drop table temporary.trivial;</codeblock> + + <p conref="../shared/impala_common.xml#common/disk_space_blurb"/> + + <p conref="../shared/impala_common.xml#common/s3_blurb"/> + <p rev="2.2.0"> + Although Impala cannot write new data to a table stored in the Amazon + S3 filesystem, the <codeph>DROP TABLE</codeph> statement can remove data files from S3 + if the associated S3 table is an internal table. + See <xref href="impala_s3.xml#s3"/> for details about working with S3 tables. + </p> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb"/> + <p rev="CDH-19187"> + For an internal table, the user ID that the <cmdname>impalad</cmdname> daemon runs under, + typically the <codeph>impala</codeph> user, must have write + permission for all the files and directories that make up the table. + </p> + <p> + For an external table, dropping the table only involves changes to metadata in the metastore database. + Because Impala does not remove any HDFS files or directories when external tables are dropped, + no particular permissions are needed for the associated HDFS files or directories. 
+ </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_tables.xml#tables"/>, + <xref href="impala_alter_table.xml#alter_table"/>, <xref href="impala_create_table.xml#create_table"/>, + <xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>, + <xref href="impala_tables.xml#external_tables"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_drop_view.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_drop_view.xml b/docs/topics/impala_drop_view.xml new file mode 100644 index 0000000..edcab58 --- /dev/null +++ b/docs/topics/impala_drop_view.xml @@ -0,0 +1,48 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="1.1" id="drop_view"> + + <title>DROP VIEW Statement</title> + <titlealts><navtitle>DROP VIEW</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="DDL"/> + <data name="Category" value="Schemas"/> + <data name="Category" value="Tables"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">DROP VIEW statement</indexterm> + Removes the specified view, which was originally created by the <codeph>CREATE VIEW</codeph> statement. + Because a view is purely a logical construct (an alias for a query) with no physical data behind it, + <codeph>DROP VIEW</codeph> only involves changes to metadata in the metastore database, not any data files in + HDFS. 
+ </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>DROP VIEW [IF EXISTS] [<varname>db_name</varname>.]<varname>view_name</varname></codeblock> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p conref="../shared/impala_common.xml#common/create_drop_view_examples"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_views.xml#views"/>, <xref href="impala_create_view.xml#create_view"/>, + <xref href="impala_alter_view.xml#alter_view"/> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_exec_single_node_rows_threshold.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_exec_single_node_rows_threshold.xml b/docs/topics/impala_exec_single_node_rows_threshold.xml new file mode 100644 index 0000000..fa3007d --- /dev/null +++ b/docs/topics/impala_exec_single_node_rows_threshold.xml @@ -0,0 +1,91 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="2.0.0" id="exec_single_node_rows_threshold" xml:lang="en-US"> + + <title>EXEC_SINGLE_NODE_ROWS_THRESHOLD Query Option</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Scalability"/> + <data name="Category" value="Performance"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">EXEC_SINGLE_NODE_ROWS_THRESHOLD query option</indexterm> + This setting controls the cutoff point (in terms of number of rows scanned) below which Impala treats a query + as a <q>small</q> query, turning off 
optimizations such as parallel execution and native code generation. The + overhead for these optimizations is applicable for queries involving substantial amounts of data, but it + makes sense to skip them for queries involving tiny amounts of data. Reducing the overhead for small queries + allows Impala to complete them more quickly, keeping YARN resources, admission control slots, and so on + available for data-intensive queries. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>SET EXEC_SINGLE_NODE_ROWS_THRESHOLD=<varname>number_of_rows</varname></codeblock> + + <p> + <b>Type:</b> numeric + </p> + + <p> + <b>Default:</b> 100 + </p> + + <p> + <b>Usage notes:</b> Typically, you increase the default value to make this optimization apply to more queries. + If incorrect or corrupted table and column statistics cause Impala to apply this optimization + incorrectly to queries that actually involve substantial work, you might see the queries being slower as a + result of remote reads. In that case, recompute statistics with the <codeph>COMPUTE STATS</codeph> + or <codeph>COMPUTE INCREMENTAL STATS</codeph> statement. If there is a problem collecting accurate + statistics, you can turn this feature off by setting the value to -1. + </p> + + <p conref="../shared/impala_common.xml#common/internals_blurb"/> + + <p> + This setting applies to query fragments where the amount of data to scan can be accurately determined, either + through table and column statistics, or by the presence of a <codeph>LIMIT</codeph> clause. If Impala cannot + accurately estimate the size of the input data, this setting does not apply. 
+ </p> + + <p rev="2.3.0"> + In CDH 5.5 / Impala 2.3 and higher, where Impala supports the complex data types <codeph>STRUCT</codeph>, + <codeph>ARRAY</codeph>, and <codeph>MAP</codeph>, if a query refers to any column of those types, + the small-query optimization is turned off for that query regardless of the + <codeph>EXEC_SINGLE_NODE_ROWS_THRESHOLD</codeph> setting. + </p> + + <p> + For a query that is determined to be <q>small</q>, all work is performed on the coordinator node. This might + result in some I/O being performed by remote reads. The savings from not distributing the query work and not + generating native code are expected to outweigh any overhead from the remote reads. + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p> + A common use case is to query just a few rows from a table to inspect typical data values. In this example, + Impala does not parallelize the query or perform native code generation because the result set is guaranteed + to be smaller than the threshold value from this query option: + </p> + +<codeblock>SET EXEC_SINGLE_NODE_ROWS_THRESHOLD=500; +SELECT * FROM enormous_table LIMIT 300; +</codeblock> + +<!-- Don't have any other places that tie into this particular optimization technique yet. 
+Potentially: conceptual topics about code generation, distributed queries + +<p conref="/Content/impala_common_xi44078.xml#common/related_info"/> +<p> +</p> +--> + + </conbody> + +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_explain.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_explain.xml b/docs/topics/impala_explain.xml new file mode 100644 index 0000000..c9e8846 --- /dev/null +++ b/docs/topics/impala_explain.xml @@ -0,0 +1,224 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="explain"> + + <title>EXPLAIN Statement</title> + <titlealts><navtitle>EXPLAIN</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Reports"/> + <data name="Category" value="Planning"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Troubleshooting"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">EXPLAIN statement</indexterm> + Returns the execution plan for a statement, showing the low-level mechanisms that Impala will use to read the + data, divide the work among nodes in the cluster, and transmit intermediate and final results across the + network. Use <codeph>explain</codeph> followed by a complete <codeph>SELECT</codeph> query. For example: + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>EXPLAIN { <varname>select_query</varname> | <varname>ctas_stmt</varname> | <varname>insert_stmt</varname> } +</codeblock> + + <p> + The <varname>select_query</varname> is a <codeph>SELECT</codeph> statement, optionally prefixed by a + <codeph>WITH</codeph> clause. See <xref href="impala_select.xml#select"/> for details. 
+ </p> + + <p> + The <varname>insert_stmt</varname> is an <codeph>INSERT</codeph> statement that inserts into or overwrites an + existing table. It can use either the <codeph>INSERT ... SELECT</codeph> or <codeph>INSERT ... + VALUES</codeph> syntax. See <xref href="impala_insert.xml#insert"/> for details. + </p> + + <p> + The <varname>ctas_stmt</varname> is a <codeph>CREATE TABLE</codeph> statement using the <codeph>AS + SELECT</codeph> clause, typically abbreviated as a <q>CTAS</q> operation. See + <xref href="impala_create_table.xml#create_table"/> for details. + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + You can interpret the output to judge whether the query is performing efficiently, and adjust the query + and/or the schema if not. For example, you might change the tests in the <codeph>WHERE</codeph> clause, add + hints to make join operations more efficient, introduce subqueries, change the order of tables in a join, add + or change partitioning for a table, collect column statistics and/or table statistics in Hive, or any other + performance tuning steps. + </p> + + <p> + The <codeph>EXPLAIN</codeph> output reminds you if table or column statistics are missing from any table + involved in the query. These statistics are important for optimizing queries involving large tables or + multi-table joins. See <xref href="impala_compute_stats.xml#compute_stats"/> for how to gather statistics, + and <xref href="impala_perf_stats.xml#perf_stats"/> for how to use this information for query tuning. + </p> + + <p conref="../shared/impala_common.xml#common/explain_interpret"/> + + <p> + If you come from a traditional database background and are not familiar with data warehousing, keep in mind + that Impala is optimized for full table scans across very large tables. The structure and distribution of + this data is typically not suitable for the kind of indexing and single-row lookups that are common in OLTP + environments. 
Seeing a query scan entirely through a large table is common, not necessarily an indication of + an inefficient query. Of course, if you can reduce the volume of scanned data by orders of magnitude, for + example by using a query that affects only certain partitions within a partitioned table, then you might be + able to optimize a query so that it executes in seconds rather than minutes. + </p> + + <p> + For more information and examples to help you interpret <codeph>EXPLAIN</codeph> output, see + <xref href="impala_explain_plan.xml#perf_explain"/>. + </p> + + <p rev="1.2"> + <b>Extended EXPLAIN output:</b> + </p> + + <p rev="1.2"> + For performance tuning of complex queries, and capacity planning (such as using the admission control and + resource management features), you can enable more detailed and informative output for the + <codeph>EXPLAIN</codeph> statement. In the <cmdname>impala-shell</cmdname> interpreter, issue the command + <codeph>SET EXPLAIN_LEVEL=<varname>level</varname></codeph>, where <varname>level</varname> is an integer + from 0 to 3 or corresponding mnemonic values <codeph>minimal</codeph>, <codeph>standard</codeph>, + <codeph>extended</codeph>, or <codeph>verbose</codeph>. + </p> + + <p rev="1.2"> + When extended <codeph>EXPLAIN</codeph> output is enabled, <codeph>EXPLAIN</codeph> statements print + information about estimated memory requirements, minimum number of virtual cores, and so on that you can use + to fine-tune the resource management options explained in + <xref href="impala_resource_management.xml#rm_options"/>. (The estimated memory requirements are + intentionally on the high side, to allow a margin for error, to avoid cancelling a query unnecessarily if you + set the <codeph>MEM_LIMIT</codeph> option to the estimated memory figure.) + </p> + + <p> + See <xref href="impala_explain_level.xml#explain_level"/> for details and examples. 
+ </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p> + This example shows how the standard <codeph>EXPLAIN</codeph> output moves from the lowest (physical) level to + the higher (logical) levels. The query begins by scanning a certain amount of data; each node performs an + aggregation operation (evaluating <codeph>COUNT(*)</codeph>) on some subset of data that is local to that + node; the intermediate results are transmitted back to the coordinator node (labelled here as the + <codeph>EXCHANGE</codeph> node); lastly, the intermediate results are summed to display the final result. + </p> + +<codeblock id="explain_plan_simple">[impalad-host:21000] > explain select count(*) from customer_address; ++----------------------------------------------------------+ +| Explain String | ++----------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=42.00MB VCores=1 | +| | +| 03:AGGREGATE [MERGE FINALIZE] | +| | output: sum(count(*)) | +| | | +| 02:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +| 01:AGGREGATE | +| | output: count(*) | +| | | +| 00:SCAN HDFS [default.customer_address] | +| partitions=1/1 size=5.25MB | ++----------------------------------------------------------+ +</codeblock> + + <p> + These examples show how the extended <codeph>EXPLAIN</codeph> output becomes more accurate and informative as + statistics are gathered by the <codeph>COMPUTE STATS</codeph> statement. Initially, much of the information + about data size and distribution is marked <q>unavailable</q>. Impala can determine the raw data size, but + not the number of rows or number of distinct values for each column without additional analysis. The + <codeph>COMPUTE STATS</codeph> statement performs this analysis, so a subsequent <codeph>EXPLAIN</codeph> + statement has additional information to use in deciding how to optimize the distributed query. 
+ </p> + + <draft-comment translate="no"> +Re-run these examples with more substantial tables populated with data. +</draft-comment> + +<codeblock rev="1.2">[localhost:21000] > set explain_level=extended; +EXPLAIN_LEVEL set to extended +[localhost:21000] > explain select x from t1; ++----------------------------------------------------------+ +| Explain String | ++----------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=32.00MB VCores=1 | +| | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| | hosts=1 per-host-mem=unavailable | +<b>| | tuple-ids=0 row-size=4B cardinality=unavailable |</b> +| | | +| 00:SCAN HDFS [default.t1, PARTITION=RANDOM] | +| partitions=1/1 size=36B | +<b>| table stats: unavailable |</b> +<b>| column stats: unavailable |</b> +| hosts=1 per-host-mem=32.00MB | +<b>| tuple-ids=0 row-size=4B cardinality=unavailable |</b> ++----------------------------------------------------------+ +</codeblock> + +<codeblock rev="1.2">[localhost:21000] > compute stats t1; ++-----------------------------------------+ +| summary | ++-----------------------------------------+ +| Updated 1 partition(s) and 1 column(s). 
| ++-----------------------------------------+ +[localhost:21000] > explain select x from t1; ++----------------------------------------------------------+ +| Explain String | ++----------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=64.00MB VCores=1 | +| | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| | hosts=1 per-host-mem=unavailable | +| | tuple-ids=0 row-size=4B cardinality=0 | +| | | +| 00:SCAN HDFS [default.t1, PARTITION=RANDOM] | +| partitions=1/1 size=36B | +<b>| table stats: 0 rows total |</b> +<b>| column stats: all |</b> +| hosts=1 per-host-mem=64.00MB | +<b>| tuple-ids=0 row-size=4B cardinality=0 |</b> ++----------------------------------------------------------+ +</codeblock> + + <p conref="../shared/impala_common.xml#common/security_blurb"/> + <p conref="../shared/impala_common.xml#common/redaction_yes"/> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb"/> + <p rev="CDH-19187"> + <!-- Doublecheck these details. Does EXPLAIN really need any permissions? --> + The user ID that the <cmdname>impalad</cmdname> daemon runs under, + typically the <codeph>impala</codeph> user, must have read + and execute permissions for all applicable directories in all source tables + for the query that is being explained. + (A <codeph>SELECT</codeph> operation could read files from multiple different HDFS directories + if the source table is partitioned.) 
+ </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_select.xml#select"/>, + <xref href="impala_insert.xml#insert"/>, + <xref href="impala_create_table.xml#create_table"/>, + <xref href="impala_explain_plan.xml#explain_plan"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_explain_level.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_explain_level.xml b/docs/topics/impala_explain_level.xml new file mode 100644 index 0000000..f54e8a8 --- /dev/null +++ b/docs/topics/impala_explain_level.xml @@ -0,0 +1,338 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="1.2" id="explain_level"> + + <title>EXPLAIN_LEVEL Query Option</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Reports"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">EXPLAIN_LEVEL query option</indexterm> + Controls the amount of detail provided in the output of the <codeph>EXPLAIN</codeph> statement. The basic + output can help you identify high-level performance issues such as scanning a higher volume of data or more + partitions than you expect. The higher levels of detail show how intermediate results flow between nodes and + how different SQL operations such as <codeph>ORDER BY</codeph>, <codeph>GROUP BY</codeph>, joins, and + <codeph>WHERE</codeph> clauses are implemented within a distributed query. 
+ </p> + + <p> + <b>Type:</b> <codeph>STRING</codeph> or <codeph>INT</codeph> + </p> + + <p> + <b>Default:</b> <codeph>1</codeph> + </p> + + <p> + <b>Arguments:</b> + </p> + + <p> + The allowed range of numeric values for this option is 0 to 3: + </p> + + <ul> + <li> + <codeph>0</codeph> or <codeph>MINIMAL</codeph>: A barebones list, one line per operation. Primarily useful + for checking the join order in very long queries where the regular <codeph>EXPLAIN</codeph> output is too + long to read easily. + </li> + + <li> + <codeph>1</codeph> or <codeph>STANDARD</codeph>: The default level of detail, showing the logical way that + work is split up for the distributed query. + </li> + + <li> + <codeph>2</codeph> or <codeph>EXTENDED</codeph>: Includes additional detail about how the query planner + uses statistics in its decision-making process, to understand how a query could be tuned by gathering + statistics, using query hints, adding or removing predicates, and so on. + </li> + + <li> + <codeph>3</codeph> or <codeph>VERBOSE</codeph>: The maximum level of detail, showing how work is split up + within each node into <q>query fragments</q> that are connected in a pipeline. This extra detail is + primarily useful for low-level performance testing and tuning within Impala itself, rather than for + rewriting the SQL code at the user level. + </li> + </ul> + + <note> + Prior to Impala 1.3, the allowed argument range for <codeph>EXPLAIN_LEVEL</codeph> was 0 to 1: level 0 had + the mnemonic <codeph>NORMAL</codeph>, and level 1 was <codeph>VERBOSE</codeph>. In Impala 1.3 and higher, + <codeph>NORMAL</codeph> is not a valid mnemonic value, and <codeph>VERBOSE</codeph> still applies to the + highest level of detail but now corresponds to level 3. You might need to adjust the values if you have any + older <codeph>impala-shell</codeph> script files that set the <codeph>EXPLAIN_LEVEL</codeph> query option. 
+ </note> + + <p> + Changing the value of this option controls the amount of detail in the output of the <codeph>EXPLAIN</codeph> + statement. The extended information from level 2 or 3 is especially useful during performance tuning, when + you need to confirm whether the work for the query is distributed the way you expect, particularly for the + most resource-intensive operations such as join queries against large tables, queries against tables with + large numbers of partitions, and insert operations for Parquet tables. The extended information also helps to + check estimated resource usage when you use the admission control or resource management features explained + in <xref href="impala_resource_management.xml#resource_management"/>. See + <xref href="impala_explain.xml#explain"/> for the syntax of the <codeph>EXPLAIN</codeph> statement, and + <xref href="impala_explain_plan.xml#perf_explain"/> for details about how to use the extended information. + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + As always, read the <codeph>EXPLAIN</codeph> output from bottom to top. The lowest lines represent the + initial work of the query (scanning data files), the lines in the middle represent calculations done on each + node and how intermediate results are transmitted from one node to another, and the topmost lines represent + the final results being sent back to the coordinator node. + </p> + + <p> + The numbers in the left column are generated internally during the initial planning phase and do not + represent the actual order of operations, so it is not significant if they appear out of order in the + <codeph>EXPLAIN</codeph> output. + </p> + + <p> + At all <codeph>EXPLAIN</codeph> levels, the plan contains a warning if any tables in the query are missing + statistics. Use the <codeph>COMPUTE STATS</codeph> statement to gather statistics for each table and suppress + this warning. 
See <xref href="impala_perf_stats.xml#perf_stats"/> for details about how the statistics help + query performance. + </p> + + <p> + The <codeph>PROFILE</codeph> command in <cmdname>impala-shell</cmdname> always starts with an explain plan + showing full detail, the same as with <codeph>EXPLAIN_LEVEL=3</codeph>. <ph rev="1.4.0">After the explain + plan comes the executive summary, the same output as produced by the <codeph>SUMMARY</codeph> command in + <cmdname>impala-shell</cmdname>.</ph> + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p> + These examples use a trivial, empty table to illustrate how the essential aspects of query planning are shown + in <codeph>EXPLAIN</codeph> output: + </p> + +<codeblock>[localhost:21000] > create table t1 (x int, s string); +[localhost:21000] > set explain_level=1; +[localhost:21000] > explain select count(*) from t1; ++------------------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=10.00MB VCores=1 | +| WARNING: The following tables are missing relevant table and/or column statistics. | +| explain_plan.t1 | +| | +| 03:AGGREGATE [MERGE FINALIZE] | +| | output: sum(count(*)) | +| | | +| 02:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +| 01:AGGREGATE | +| | output: count(*) | +| | | +| 00:SCAN HDFS [explain_plan.t1] | +| partitions=1/1 size=0B | ++------------------------------------------------------------------------------------+ +[localhost:21000] > explain select * from t1; ++------------------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | +| WARNING: The following tables are missing relevant table and/or column statistics. 
| +| explain_plan.t1 | +| | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +| 00:SCAN HDFS [explain_plan.t1] | +| partitions=1/1 size=0B | ++------------------------------------------------------------------------------------+ +[localhost:21000] > set explain_level=2; +[localhost:21000] > explain select * from t1; ++------------------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | +| WARNING: The following tables are missing relevant table and/or column statistics. | +| explain_plan.t1 | +| | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| | hosts=0 per-host-mem=unavailable | +| | tuple-ids=0 row-size=19B cardinality=unavailable | +| | | +| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] | +| partitions=1/1 size=0B | +| table stats: unavailable | +| column stats: unavailable | +| hosts=0 per-host-mem=0B | +| tuple-ids=0 row-size=19B cardinality=unavailable | ++------------------------------------------------------------------------------------+ +[localhost:21000] > set explain_level=3; +[localhost:21000] > explain select * from t1; ++------------------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | +<b>| WARNING: The following tables are missing relevant table and/or column statistics. 
|</b> +<b>| explain_plan.t1 |</b> +| | +| F01:PLAN FRAGMENT [PARTITION=UNPARTITIONED] | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| hosts=0 per-host-mem=unavailable | +| tuple-ids=0 row-size=19B cardinality=unavailable | +| | +| F00:PLAN FRAGMENT [PARTITION=RANDOM] | +| DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, PARTITION=UNPARTITIONED] | +| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] | +| partitions=1/1 size=0B | +<b>| table stats: unavailable |</b> +<b>| column stats: unavailable |</b> +| hosts=0 per-host-mem=0B | +| tuple-ids=0 row-size=19B cardinality=unavailable | ++------------------------------------------------------------------------------------+ +</codeblock> + + <p> + As the warning message demonstrates, most of the information needed for Impala to do efficient query + planning, and for you to understand the performance characteristics of the query, requires running the + <codeph>COMPUTE STATS</codeph> statement for the table: + </p> + +<codeblock>[localhost:21000] > compute stats t1; ++-----------------------------------------+ +| summary | ++-----------------------------------------+ +| Updated 1 partition(s) and 2 column(s). 
| ++-----------------------------------------+ +[localhost:21000] > explain select * from t1; ++------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | +| | +| F01:PLAN FRAGMENT [PARTITION=UNPARTITIONED] | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| hosts=0 per-host-mem=unavailable | +| tuple-ids=0 row-size=20B cardinality=0 | +| | +| F00:PLAN FRAGMENT [PARTITION=RANDOM] | +| DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, PARTITION=UNPARTITIONED] | +| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] | +| partitions=1/1 size=0B | +<b>| table stats: 0 rows total |</b> +<b>| column stats: all |</b> +| hosts=0 per-host-mem=0B | +| tuple-ids=0 row-size=20B cardinality=0 | ++------------------------------------------------------------------------+ +</codeblock> + + <p> + Joins and other complicated, multi-part queries are the ones where you most commonly need to examine the + <codeph>EXPLAIN</codeph> output and customize the amount of detail in the output. This example shows the + default <codeph>EXPLAIN</codeph> output for a three-way join query, then the equivalent output with a + <codeph>[SHUFFLE]</codeph> hint to change the join mechanism between the first two tables from a broadcast + join to a shuffle join. 
+ </p> + +<codeblock>[localhost:21000] > set explain_level=1; +[localhost:21000] > explain select one.*, two.*, three.* from t1 one, t1 two, t1 three where one.x = two.x and two.x = three.x; ++------------------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 | +| | +| 07:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b> +| | hash predicates: two.x = three.x | +| | | +<b>| |--06:EXCHANGE [BROADCAST] |</b> +| | | | +| | 02:SCAN HDFS [explain_plan.t1 three] | +| | partitions=1/1 size=0B | +| | | +<b>| 03:HASH JOIN [INNER JOIN, BROADCAST] |</b> +| | hash predicates: one.x = two.x | +| | | +<b>| |--05:EXCHANGE [BROADCAST] |</b> +| | | | +| | 01:SCAN HDFS [explain_plan.t1 two] | +| | partitions=1/1 size=0B | +| | | +| 00:SCAN HDFS [explain_plan.t1 one] | +| partitions=1/1 size=0B | ++------------------------------------------------------------------------------------+ +[localhost:21000] > explain select one.*, two.*, three.* from t1 one join [shuffle] t1 two join t1 three where one.x = two.x and two.x = three.x; ++------------------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 | +| | +| 08:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b> +| | hash predicates: two.x = three.x | +| | | +<b>| |--07:EXCHANGE [BROADCAST] |</b> +| | | | +| | 02:SCAN HDFS [explain_plan.t1 three] | +| | partitions=1/1 size=0B | +| | | +<b>| 03:HASH JOIN [INNER JOIN, PARTITIONED] |</b> +| | hash predicates: one.x = two.x | +| | | +<b>| |--06:EXCHANGE [PARTITION=HASH(two.x)] |</b> +| | | | +| | 01:SCAN HDFS [explain_plan.t1 two] | +| | 
partitions=1/1 size=0B | +| | | +<b>| 05:EXCHANGE [PARTITION=HASH(one.x)] |</b> +| | | +| 00:SCAN HDFS [explain_plan.t1 one] | +| partitions=1/1 size=0B | ++------------------------------------------------------------------------------------+ +</codeblock> + + <p> + For a join involving many different tables, the default <codeph>EXPLAIN</codeph> output might stretch over + several pages, and the only details you care about might be the join order and the mechanism (broadcast or + shuffle) for joining each pair of tables. In that case, you might set <codeph>EXPLAIN_LEVEL</codeph> to its + lowest value of 0, to focus on just the join order and join mechanism for each stage. The following example + shows how the rows from the first and second joined tables are hashed and divided among the nodes of the + cluster for further filtering; then the entire contents of the third table are broadcast to all nodes for the + final stage of join processing. + </p> + +<codeblock>[localhost:21000] > set explain_level=0; +[localhost:21000] > explain select one.*, two.*, three.* from t1 one join [shuffle] t1 two join t1 three where one.x = two.x and two.x = three.x; ++---------------------------------------------------------+ +| Explain String | ++---------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 | +| | +| 08:EXCHANGE [PARTITION=UNPARTITIONED] | +<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b> +<b>| |--07:EXCHANGE [BROADCAST] |</b> +| | 02:SCAN HDFS [explain_plan.t1 three] | +<b>| 03:HASH JOIN [INNER JOIN, PARTITIONED] |</b> +<b>| |--06:EXCHANGE [PARTITION=HASH(two.x)] |</b> +| | 01:SCAN HDFS [explain_plan.t1 two] | +<b>| 05:EXCHANGE [PARTITION=HASH(one.x)] |</b> +| 00:SCAN HDFS [explain_plan.t1 one] | ++---------------------------------------------------------+ +</codeblock> + +<!-- Consider adding a related info section to collect the xrefs earlier on this page. 
--> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/463ddf92/docs/topics/impala_float.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_float.xml b/docs/topics/impala_float.xml new file mode 100644 index 0000000..51e3311 --- /dev/null +++ b/docs/topics/impala_float.xml @@ -0,0 +1,94 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="float"> + + <title>FLOAT Data Type</title> + <titlealts><navtitle>FLOAT</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Data Types"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Schemas"/> + </metadata> + </prolog> + + <conbody> + + <p> + A single precision floating-point data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER + TABLE</codeph> statements. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + + <p> + In the column definition of a <codeph>CREATE TABLE</codeph> statement: + </p> + +<codeblock><varname>column_name</varname> FLOAT</codeblock> + + <p> + <b>Range:</b> 1.40129846432481707e-45 .. 3.40282346638528860e+38, positive or negative + </p> + + <p> + <b>Precision:</b> 6 to 9 significant digits, depending on usage. The number of significant digits does + not depend on the position of the decimal point. + </p> + + <p> + <b>Representation:</b> The values are stored in 4 bytes, using + <xref href="https://en.wikipedia.org/wiki/Single-precision_floating-point_format" scope="external" format="html">IEEE 754 Single Precision Binary Floating Point</xref> format. + </p> + + <p> + <b>Conversions:</b> Impala automatically converts <codeph>FLOAT</codeph> to more precise + <codeph>DOUBLE</codeph> values, but not the other way around. 
You can use <codeph>CAST()</codeph> to convert + <codeph>FLOAT</codeph> values to <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, <codeph>INT</codeph>, + <codeph>BIGINT</codeph>, <codeph>STRING</codeph>, <codeph>TIMESTAMP</codeph>, or <codeph>BOOLEAN</codeph>. + You can use exponential notation in <codeph>FLOAT</codeph> literals or when casting from + <codeph>STRING</codeph>, for example <codeph>1.0e6</codeph> to represent one million. + <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/> + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<codeblock>CREATE TABLE t1 (x FLOAT); +SELECT CAST(1000.5 AS FLOAT); +</codeblock> + + <p conref="../shared/impala_common.xml#common/partitioning_imprecise"/> + + <p conref="../shared/impala_common.xml#common/hbase_ok"/> + + <p conref="../shared/impala_common.xml#common/parquet_ok"/> + + <p conref="../shared/impala_common.xml#common/text_bulky"/> + +<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> --> + + <p conref="../shared/impala_common.xml#common/internals_4_bytes"/> + +<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> --> + + <p conref="../shared/impala_common.xml#common/column_stats_constant"/> + + <p conref="../shared/impala_common.xml#common/restrictions_blurb"/> + +<!-- This conref appears under SUM(), AVG(), FLOAT, and DOUBLE topics. --> + + <p conref="../shared/impala_common.xml#common/sum_double"/> + + <p conref="../shared/impala_common.xml#common/float_double_decimal_caveat"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_math_functions.xml#math_functions"/>, + <xref href="impala_double.xml#double"/> + </p> + </conbody> +</concept>
