http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_bigint.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_bigint.xml b/docs/topics/impala_bigint.xml index 8f31bc6..d9223b0 100644 --- a/docs/topics/impala_bigint.xml +++ b/docs/topics/impala_bigint.xml @@ -3,7 +3,7 @@ <concept id="bigint"> <title>BIGINT Data Type</title> - <titlealts><navtitle>BIGINT</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>BIGINT</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -74,11 +74,11 @@ SELECT CAST(1000 AS BIGINT); <p conref="../shared/impala_common.xml#common/hbase_ok"/> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/parquet_blurb"/> --> +<!-- <p conref="../shared/impala_common.xml#common/parquet_blurb"/> --> <p conref="../shared/impala_common.xml#common/text_bulky"/> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> --> +<!-- <p conref="../shared/impala_common.xml#common/compatibility_blurb"/> --> <p conref="../shared/impala_common.xml#common/internals_8_bytes"/> @@ -86,7 +86,9 @@ SELECT CAST(1000 AS BIGINT); <p conref="../shared/impala_common.xml#common/column_stats_constant"/> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> --> + <p conref="../shared/impala_common.xml#common/sqoop_blurb"/> + + <p conref="../shared/impala_common.xml#common/sqoop_timestamp_caveat"/> <p conref="../shared/impala_common.xml#common/related_info"/>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_bit_functions.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_bit_functions.xml b/docs/topics/impala_bit_functions.xml index 77c7e5d..2ac0f68 100644 --- a/docs/topics/impala_bit_functions.xml +++ b/docs/topics/impala_bit_functions.xml @@ -2,7 +2,7 @@ <concept id="bit_functions" rev="2.3.0"> <title>Impala Bit Functions</title> - <titlealts><navtitle>Bit Functions</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>Bit Functions</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -16,7 +16,7 @@ <conbody> - <p> + <p rev="2.3.0"> Bit manipulation functions perform bitwise operations involved in scientific processing or computer science algorithms. For example, these functions include setting, clearing, or testing bits within an integer value, or changing the positions of bits with or without wraparound. @@ -68,10 +68,6 @@ shiftleft shiftright --> -<!-- Include this conref for all the bit functions, all newly added in Impala 2.3.0. - <p conref="../shared/impala_common.xml#common/added_in_230"/> ---> - <dl> <dlentry id="bitand"> @@ -264,7 +260,7 @@ select bitor(0,7); /* 00000000 | 00000111 */ The following examples show the results of XORing integer values. XORing a non-zero value with zero returns the non-zero value. XORing two identical values returns zero, because all the 1 bits from the first argument are also 1 bits in the second argument. - XORing different non-zero values turns off some bits and leaves others turned on, based on whether the same bit is set in both arguments. + XORing different non-zero values turns off some bits and leaves others turned on, based on whether the same bit is set in both arguments. </p> <codeblock>select bitxor(0,15); /* 00000000 ^ 00001111 */ +---------------+ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_boolean.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_boolean.xml b/docs/topics/impala_boolean.xml index 6a8e299..c788b33 100644 --- a/docs/topics/impala_boolean.xml +++ b/docs/topics/impala_boolean.xml @@ -3,7 +3,7 @@ <concept id="boolean"> <title>BOOLEAN Data Type</title> - <titlealts><navtitle>BOOLEAN</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>BOOLEAN</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -43,13 +43,39 @@ </p> <p> - You can use <codeph>CAST()</codeph> to convert <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, + You can use <codeph>CAST()</codeph> to convert +<!-- + <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, <codeph>INT</codeph>, <codeph>BIGINT</codeph>, <codeph>FLOAT</codeph>, or <codeph>DOUBLE</codeph> -<!-- any integer or floating-point type to --> +--> + any integer or floating-point type to <codeph>BOOLEAN</codeph>: a value of 0 represents <codeph>false</codeph>, and any non-zero value is converted to <codeph>true</codeph>. </p> +<codeblock>SELECT CAST(42 AS BOOLEAN) AS nonzero_int, CAST(99.44 AS BOOLEAN) AS nonzero_decimal, + CAST(000 AS BOOLEAN) AS zero_int, CAST(0.0 AS BOOLEAN) AS zero_decimal; ++-------------+-----------------+----------+--------------+ +| nonzero_int | nonzero_decimal | zero_int | zero_decimal | ++-------------+-----------------+----------+--------------+ +| true | true | false | false | ++-------------+-----------------+----------+--------------+ +</codeblock> + + <p> + When you cast the opposite way, from <codeph>BOOLEAN</codeph> to a numeric type, + the result becomes either 1 or 0: + </p> + +<codeblock>SELECT CAST(true AS INT) AS true_int, CAST(true AS DOUBLE) AS true_double, + CAST(false AS INT) AS false_int, CAST(false AS DOUBLE) AS false_double; ++----------+-------------+-----------+--------------+ +| true_int | true_double | false_int | false_double | ++----------+-------------+-----------+--------------+ +| 1 | 1 | 0 | 0 | ++----------+-------------+-----------+--------------+ +</codeblock> + <p rev="1.4.0"> <!-- BOOLEAN-to-DECIMAL casting requested in IMPALA-991. As of Sept. 2014, designated "won't fix". --> You can cast <codeph>DECIMAL</codeph> values to <codeph>BOOLEAN</codeph>, with the same treatment of zero and @@ -107,17 +133,17 @@ SELECT claim FROM assertions WHERE really = TRUE; <p conref="../shared/impala_common.xml#common/text_bulky"/> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> --> +<!-- <p conref="../shared/impala_common.xml#common/compatibility_blurb"/> --> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/internals_blurb"/> --> +<!-- <p conref="../shared/impala_common.xml#common/internals_blurb"/> --> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> --> +<!-- <p conref="../shared/impala_common.xml#common/added_in_20"/> --> <p conref="../shared/impala_common.xml#common/column_stats_constant"/> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/restrictions_blurb"/> --> +<!-- <p conref="../shared/impala_common.xml#common/restrictions_blurb"/> --> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/related_info"/> --> +<!-- <p conref="../shared/impala_common.xml#common/related_info"/> --> <p> <b>Related information:</b> <xref href="impala_literals.xml#boolean_literals"/>, http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_char.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_char.xml b/docs/topics/impala_char.xml index 68cabeb..94b9401 100644 --- a/docs/topics/impala_char.xml +++ b/docs/topics/impala_char.xml @@ -3,7 +3,7 @@ <concept id="char" rev="2.0.0"> <title>CHAR Data Type (CDH 5.2 or higher only)</title> - <titlealts><navtitle>CHAR (CDH 5.2 or higher only)</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>CHAR</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -17,7 +17,7 @@ <conbody> - <p> + <p rev="2.0.0"> <indexterm audience="Cloudera">CHAR data type</indexterm> A fixed-length character type, padded with trailing spaces if necessary to achieve the specified length. If values are longer than the specified length, Impala truncates any trailing characters. @@ -101,6 +101,9 @@ <codeph>CHAR</codeph> columns. </p> + <p><b>Avro considerations:</b></p> + <p conref="../shared/impala_common.xml#common/avro_2gb_strings"/> + <p conref="../shared/impala_common.xml#common/compatibility_blurb"/> <p> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_comments.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_comments.xml b/docs/topics/impala_comments.xml index 96b9479..07531dc 100644 --- a/docs/topics/impala_comments.xml +++ b/docs/topics/impala_comments.xml @@ -7,6 +7,8 @@ <metadata> <data name="Category" value="Impala"/> <data name="Category" value="SQL"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_complex_types.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_complex_types.xml b/docs/topics/impala_complex_types.xml index 9fe7362..77e9707 100644 --- a/docs/topics/impala_complex_types.xml +++ b/docs/topics/impala_complex_types.xml @@ -2,25 +2,28 @@ <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> <concept rev="2.3.0" id="complex_types"> - <title id="nested_types">Complex Types (CDH 5.5 and higher only)</title> + <title id="nested_types">Complex Types (CDH 5.5 or higher only)</title> <prolog> <metadata> <data name="Category" value="Impala"/> <data name="Category" value="Impala Data Types"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> <conbody> - <p> + <p rev="2.3.0"> <indexterm audience="Cloudera">complex types</indexterm> <indexterm audience="Cloudera">nested types</indexterm> <term>Complex types</term> (also referred to as <term>nested types</term>) let you represent multiple data values within a single row/column position. They differ from the familiar column types such as <codeph>BIGINT</codeph> and <codeph>STRING</codeph>, known as <term>scalar types</term> or <term>primitive types</term>, which represent a single data value within a given row/column position. - Impala supports the complex types <codeph>ARRAY</codeph>, <codeph>MAP</codeph>, and <codeph>STRUCT</codeph> in Impala 2.3 / CDH 5.5 + Impala supports the complex types <codeph>ARRAY</codeph>, <codeph>MAP</codeph>, and <codeph>STRUCT</codeph> in CDH 5.5 / Impala 2.3 and higher. The Hive <codeph>UNION</codeph> type is not currently supported. </p> @@ -345,6 +348,13 @@ nested type data and Impala queries on that table will generate errors. </p> + <note rev="2.6.0 IMPALA-2844"> + <p rev="2.6.0 IMPALA-2844"> + The one exception to the preceding rule is <codeph>COUNT(*)</codeph> queries on RCFile tables that include complex types. + Such queries are allowed in CDH 5.8 / Impala 2.6 and higher. + </p> + </note> + <p> You can perform DDL operations (even <codeph>CREATE TABLE</codeph>) for tables involving complex types in file formats other than Parquet. The DDL support lets you set up intermediate tables in your ETL pipeline, to be populated by Hive, before the final stage @@ -565,6 +575,8 @@ The maximum depth of nesting for complex types is 100 levels. </p> + <p conref="../shared/impala_common.xml#common/complex_types_max_length"/> + <p> For ideal performance and scalability, use small or medium-sized collections, where all the complex columns contain at most a few hundred megabytes per row. Remember, all the columns of a row are stored in the same HDFS data block, whose size in Parquet files @@ -638,7 +650,7 @@ <codeblock> primitive_type | array_type | map_type -| struct_type +| struct_type </codeblock> <p> @@ -1472,7 +1484,7 @@ WHERE associates.item LIKE '% MacGuffin'; <codeblock>[localhost:21000] > SELECT r1.r_name, r2.n_name, <b>r2.POS</b> > FROM region r1 INNER JOIN r1.r_nations r2 - > WHERE r1.r_name = 'ASIA'; + > WHERE r1.r_name = 'ASIA'; +--------+-----------+-----+ | r_name | n_name | pos | +--------+-----------+-----+ @@ -1492,7 +1504,7 @@ WHERE associates.item LIKE '% MacGuffin'; <codeblock>[localhost:21000] > SELECT r1.r_name, r2.n_name, r2.POS > FROM region r1 INNER JOIN r1.r_nations r2 > WHERE r1.r_name = 'ASIA' - > <b>ORDER BY r2.POS DESC</b>; + > <b>ORDER BY r2.POS DESC</b>; +--------+-----------+-----+ | r_name | n_name | pos | +--------+-----------+-----+ @@ -1504,7 +1516,7 @@ WHERE associates.item LIKE '% MacGuffin'; +--------+-----------+-----+ [localhost:21000] > SELECT r1.r_name, r2.n_name, r2.POS > FROM region r1 INNER JOIN r1.r_nations r2 - > WHERE r1.r_name = 'ASIA' AND <b>r2.POS BETWEEN 1 and 3</b>; + > WHERE r1.r_name = 'ASIA' AND <b>r2.POS BETWEEN 1 and 3</b>; +--------+-----------+-----+ | r_name | n_name | pos | +--------+-----------+-----+ @@ -2531,12 +2543,12 @@ CREATE table employee_vacations STORED AS PARQUET; -- Each kind of information to track gets its own "fact table". -CREATE table vacation_details +CREATE table vacation_details ( vacation_id BIGINT, vacation_start TIMESTAMP, - duration INT -) + duration INT +) STORED AS PARQUET; -- Any time we print a human-readable report, we join with this table to @@ -2598,7 +2610,7 @@ STORED AS PARQUET; <p> You can produce Parquet data files through several Hadoop components and APIs, as explained in - <xref audience="integrated" href="cdh_ig_parquet.xml#parquet_format"/><xref audience="standalone" href="http://www.cloudera.com/content/cloudera/en/documentation/core/latest/topics/cdh_ig_parquet.html" scope="external" format="html"/>. + <xref audience="integrated" href="cdh_ig_parquet.xml#parquet_format"/><xref audience="standalone" href="http://www.cloudera.com/documentation/enterprise/latest/topics/cdh_ig_parquet.html" scope="external" format="html"/>. </p> <p> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_components.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_components.xml b/docs/topics/impala_components.xml index bf09ab0..44e5c34 100644 --- a/docs/topics/impala_components.xml +++ b/docs/topics/impala_components.xml @@ -4,7 +4,15 @@ <title>Components of the Impala Server</title> <titlealts audience="PDF"><navtitle>Components</navtitle></titlealts> - + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Concepts"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> <conbody> @@ -13,7 +21,160 @@ different daemon processes that run on specific hosts within your CDH cluster. </p> - + <p outputclass="toc inpage"/> + </conbody> + + <concept id="intro_impalad"> + + <title>The Impala Daemon</title> + + <conbody> + + <p> + The core Impala component is a daemon process that runs on each DataNode of the cluster, physically represented + by the <codeph>impalad</codeph> process. It reads and writes to data files; accepts queries transmitted + from the <codeph>impala-shell</codeph> command, Hue, JDBC, or ODBC; parallelizes the queries and + distributes work across the cluster; and transmits intermediate query results back to the + central coordinator node. + </p> + + <p> + You can submit a query to the Impala daemon running on any DataNode, and that instance of the daemon serves as the + <term>coordinator node</term> for that query. The other nodes transmit partial results back to the + coordinator, which constructs the final result set for a query. When running experiments with functionality + through the <codeph>impala-shell</codeph> command, you might always connect to the same Impala daemon for + convenience. For clusters running production workloads, you might load-balance by + submitting each query to a different Impala daemon in round-robin style, using the JDBC or ODBC interfaces. + </p> + + <p> + The Impala daemons are in constant communication with the <term>statestore</term>, to confirm which nodes + are healthy and can accept new work. + </p> + + <p rev="1.2"> + They also receive broadcast messages from the <cmdname>catalogd</cmdname> daemon (introduced in Impala 1.2) + whenever any Impala node in the cluster creates, alters, or drops any type of object, or when an + <codeph>INSERT</codeph> or <codeph>LOAD DATA</codeph> statement is processed through Impala. This + background communication minimizes the need for <codeph>REFRESH</codeph> or <codeph>INVALIDATE + METADATA</codeph> statements that were needed to coordinate metadata across nodes prior to Impala 1.2. + </p> + + <p> + <b>Related information:</b> <xref href="impala_config_options.xml#config_options"/>, + <xref href="impala_processes.xml#processes"/>, <xref href="impala_timeouts.xml#impalad_timeout"/>, + <xref href="impala_ports.xml#ports"/>, <xref href="impala_proxy.xml#proxy"/> + </p> </conbody> </concept> + <concept id="intro_statestore"> + + <title>The Impala Statestore</title> + + <conbody> + + <p> + The Impala component known as the <term>statestore</term> checks on the health of Impala daemons on all the + DataNodes in a cluster, and continuously relays its findings to each of those daemons. It is physically + represented by a daemon process named <codeph>statestored</codeph>; you only need such a process on one + host in the cluster. If an Impala daemon goes offline due to hardware failure, network error, software issue, + or other reason, the statestore informs all the other Impala daemons so that future queries can avoid making + requests to the unreachable node. + </p> + + <p> + Because the statestore's purpose is to help when things go wrong, it is not critical to the normal + operation of an Impala cluster. If the statestore is not running or becomes unreachable, the Impala daemons + continue running and distributing work among themselves as usual; the cluster just becomes less robust if + other Impala daemons fail while the statestore is offline. When the statestore comes back online, it re-establishes + communication with the Impala daemons and resumes its monitoring function. + </p> + + <p conref="../shared/impala_common.xml#common/statestored_catalogd_ha_blurb"/> + + <p> + <b>Related information:</b> + </p> + + <p> + <xref href="impala_scalability.xml#statestore_scalability"/>, + <xref href="impala_config_options.xml#config_options"/>, <xref href="impala_processes.xml#processes"/>, + <xref href="impala_timeouts.xml#statestore_timeout"/>, <xref href="impala_ports.xml#ports"/> + </p> + </conbody> + </concept> + + <concept rev="1.2" id="intro_catalogd"> + + <title>The Impala Catalog Service</title> + + <conbody> + + <p> + The Impala component known as the <term>catalog service</term> relays the metadata changes from Impala SQL + statements to all the DataNodes in a cluster. It is physically represented by a daemon process named + <codeph>catalogd</codeph>; you only need such a process on one host in the cluster. Because the requests + are passed through the statestore daemon, it makes sense to run the <cmdname>statestored</cmdname> and + <cmdname>catalogd</cmdname> services on the same host. + </p> + + <p> + The catalog service avoids the need to issue + <codeph>REFRESH</codeph> and <codeph>INVALIDATE METADATA</codeph> statements when the metadata changes are + performed by statements issued through Impala. When you create a table, load data, and so on through Hive, + you do need to issue <codeph>REFRESH</codeph> or <codeph>INVALIDATE METADATA</codeph> on an Impala node + before executing a query there. + </p> + + <p> + This feature touches a number of aspects of Impala: + </p> + +<!-- This was formerly a conref, but since the list of links also included a link + to this same topic, materializing the list here and removing that + circular link. (The conref is still used in Incompatible Changes.) + + <ul conref="../shared/impala_common.xml#common/catalogd_xrefs"> + <li/> + </ul> +--> + + <ul id="catalogd_xrefs"> + <li> + <p> + See <xref href="impala_install.xml#install"/>, <xref href="impala_upgrading.xml#upgrading"/> and + <xref href="impala_processes.xml#processes"/>, for usage information for the + <cmdname>catalogd</cmdname> daemon. + </p> + </li> + + <li> + <p> + The <codeph>REFRESH</codeph> and <codeph>INVALIDATE METADATA</codeph> statements are not needed + when the <codeph>CREATE TABLE</codeph>, <codeph>INSERT</codeph>, or other table-changing or + data-changing operation is performed through Impala. These statements are still needed if such + operations are done through Hive or by manipulating data files directly in HDFS, but in those cases the + statements only need to be issued on one Impala node rather than on all nodes. See + <xref href="impala_refresh.xml#refresh"/> and + <xref href="impala_invalidate_metadata.xml#invalidate_metadata"/> for the latest usage information for + those statements. + </p> + </li> + </ul> + + <p conref="../shared/impala_common.xml#common/load_catalog_in_background"/> + + <p conref="../shared/impala_common.xml#common/statestored_catalogd_ha_blurb"/> + + <note> + <p conref="../shared/impala_common.xml#common/catalog_server_124"/> + </note> + + <p> + <b>Related information:</b> <xref href="impala_config_options.xml#config_options"/>, + <xref href="impala_processes.xml#processes"/>, <xref href="impala_ports.xml#ports"/> + </p> + </conbody> + </concept> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_compression_codec.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_compression_codec.xml b/docs/topics/impala_compression_codec.xml index d99ac04..b00089f 100644 --- a/docs/topics/impala_compression_codec.xml +++ b/docs/topics/impala_compression_codec.xml @@ -2,7 +2,8 @@ <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> <concept rev="2.0.0" id="compression_codec"> - <title>COMPRESSION_CODEC Query Option</title> + <title>COMPRESSION_CODEC Query Option (CDH 5.2 or higher only)</title> + <titlealts audience="PDF"><navtitle>COMPRESSION_CODEC</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -11,7 +12,9 @@ <data name="Category" value="File Formats"/> <data name="Category" value="Parquet"/> <data name="Category" value="Snappy"/> - <data name="Category" value="GZip"/> + <data name="Category" value="Gzip"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -21,7 +24,7 @@ <!-- Could turn into a conref. --> - <p> + <p rev="2.0.0"> <indexterm audience="Cloudera">COMPRESSION_CODEC query option</indexterm> When Impala writes Parquet data files using the <codeph>INSERT</codeph> statement, the underlying compression is controlled by the <codeph>COMPRESSION_CODEC</codeph> query option. @@ -65,7 +68,7 @@ </p> <p> - <b>Default:</b> SNAPPY + <b>Default:</b> <codeph>SNAPPY</codeph> </p> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_compute_stats.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_compute_stats.xml b/docs/topics/impala_compute_stats.xml index abf6645..b915b77 100644 --- a/docs/topics/impala_compute_stats.xml +++ b/docs/topics/impala_compute_stats.xml @@ -3,14 +3,18 @@ <concept rev="1.2.2" id="compute_stats"> <title>COMPUTE STATS Statement</title> - <titlealts><navtitle>COMPUTE STATS</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>COMPUTE STATS</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> <data name="Category" value="Performance"/> <data name="Category" value="Scalability"/> + <data name="Category" value="ETL"/> + <data name="Category" value="Ingest"/> <data name="Category" value="SQL"/> <data name="Category" value="Tables"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -118,6 +122,16 @@ COMPUTE INCREMENTAL STATS [<varname>db_name</varname>.]<varname>table_name</varn See <xref href="impala_perf_stats.xml#perf_stats"/> for details. </p> + <p> + For large tables, the <codeph>COMPUTE STATS</codeph> statement itself might take a long time and you + might need to tune its performance. The <codeph>COMPUTE STATS</codeph> statement does not work with the + <codeph>EXPLAIN</codeph> statement, or the <codeph>SUMMARY</codeph> command in <cmdname>impala-shell</cmdname>. + You can use the <codeph>PROFILE</codeph> statement in <cmdname>impala-shell</cmdname> to examine timing information + for the statement as a whole. If a basic <codeph>COMPUTE STATS</codeph> statement takes a long time for a + partitioned table, consider switching to the <codeph>COMPUTE INCREMENTAL STATS</codeph> syntax so that only + newly added partitions are analyzed each time. + </p> + <p conref="../shared/impala_common.xml#common/example_blurb"/> <p> @@ -278,7 +292,7 @@ alter table item_partitioned add partition (i_category='Camping'); refresh item_partitioned; drop incremental stats item_partitioned partition (i_category='Sports'); -- Now some partitions have incremental stats --- and some don't. +-- and some do not. show table stats item_partitioned; +-------------+-------+--------+----------+--------------+---------+------------------ | i_category | #Rows | #Files | Size | Bytes Cached | Format | Incremental stats @@ -342,13 +356,13 @@ show table stats item_partitioned; <p> The <codeph>COMPUTE STATS</codeph> statement works with Parquet tables. These tables can be created through either Impala or Hive. - <note conref="../shared/impala_common.xml#common/compute_stats_parquet"/> </p> <p> - The <codeph>COMPUTE STATS</codeph> statement works with Avro tables, as long as they are created with - SQL-style column names and types rather than an Avro-style schema specification. These tables are currently - always created through Hive rather than Impala. + The <codeph>COMPUTE STATS</codeph> statement works with Avro tables without restriction in CDH 5.4 / Impala 2.2 + and higher. In earlier releases, <codeph>COMPUTE STATS</codeph> worked only for Avro tables created through Hive, + and required the <codeph>CREATE TABLE</codeph> statement to use SQL-style column names and types rather than an + Avro-style schema specification. </p> <p> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_conditional_functions.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_conditional_functions.xml b/docs/topics/impala_conditional_functions.xml index b922710..23de779 100644 --- a/docs/topics/impala_conditional_functions.xml +++ b/docs/topics/impala_conditional_functions.xml @@ -2,7 +2,7 @@ <concept id="conditional_functions"> <title>Impala Conditional Functions</title> - <titlealts><navtitle>Conditional Functions</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>Conditional Functions</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -196,7 +196,7 @@ <dlentry rev="1.3.0" id="ifnull"> <dt> - <codeph>ifnull(type a, type ifNotNull)</codeph> + <codeph>ifnull(type a, type ifNull)</codeph> </dt> <dd> @@ -266,7 +266,7 @@ <dlentry id="isnull"> <dt> - <codeph>isnull(type a, type ifNotNull)</codeph> + <codeph>isnull(type a, type ifNull)</codeph> </dt> <dd> @@ -304,10 +304,10 @@ </dlentry> - <dlentry id="notnullvalue" rev="2.2.0"> + <dlentry id="nonnullvalue" rev="2.2.0"> <dt> - <codeph>notnullvalue(<varname>expression</varname>)</codeph> + <codeph>nonnullvalue(<varname>expression</varname>)</codeph> </dt> <dd> @@ -390,7 +390,7 @@ END</codeblock> <indexterm audience="Cloudera">function</indexterm> <b>Purpose:</b> Tests if an expression (of any type) is <codeph>NULL</codeph> or not. Returns <codeph>true</codeph> if so. - The converse of <codeph>notnullvalue()</codeph>. + The converse of <codeph>nonnullvalue()</codeph>. <p conref="../shared/impala_common.xml#common/return_type_boolean"/> <p conref="../shared/impala_common.xml#common/for_compatibility_only"/> <p conref="../shared/impala_common.xml#common/added_in_220"/> @@ -401,7 +401,7 @@ END</codeblock> <dlentry id="nvl" rev="1.1"> <dt> - <codeph>nvl(type a, type ifNotNull)</codeph> + <codeph>nvl(type a, type ifNull)</codeph> </dt> <dd> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_conversion_functions.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_conversion_functions.xml b/docs/topics/impala_conversion_functions.xml index 1050d0c..36f5ec4 100644 --- a/docs/topics/impala_conversion_functions.xml +++ b/docs/topics/impala_conversion_functions.xml @@ -2,7 +2,7 @@ <concept id="conversion_functions"> <title>Impala Type Conversion Functions</title> - <titlealts><navtitle>Type Conversion Functions</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>Type Conversion Functions</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -25,7 +25,7 @@ overflow could occur. Also, for reporting or dealing with loosely defined schemas in big data contexts, you might frequently need to convert values to or from the <codeph>STRING</codeph> type. </p> - + <note> Although in CDH 5.5.0, the <codeph>SHOW FUNCTIONS</codeph> output for database <codeph>_IMPALA_BUILTINS</codeph> contains some function signatures http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_count.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_count.xml b/docs/topics/impala_count.xml index 2f3f519..4f9a697 100644 --- a/docs/topics/impala_count.xml +++ b/docs/topics/impala_count.xml @@ -3,7 +3,7 @@ <concept id="count"> <title>COUNT Function</title> - <titlealts><navtitle>COUNT</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>COUNT</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -12,6 +12,8 @@ <data name="Category" value="Analytic Functions"/> <data name="Category" value="Aggregate Functions"/> <data name="Category" value="Querying"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -55,10 +57,14 @@ <b>Return type:</b> <codeph>BIGINT</codeph> </p> + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p conref="../shared/impala_common.xml#common/partition_key_optimization"/> + <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/> - + <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/> <p conref="../shared/impala_common.xml#common/example_blurb"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_create_database.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_create_database.xml b/docs/topics/impala_create_database.xml index f4153e0..cb9dd84 100644 --- a/docs/topics/impala_create_database.xml +++ b/docs/topics/impala_create_database.xml @@ -3,7 +3,7 @@ <concept id="create_database"> <title>CREATE DATABASE Statement</title> - <titlealts><navtitle>CREATE DATABASE</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>CREATE DATABASE</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -11,6 +11,9 @@ <data name="Category" value="Databases"/> <data name="Category" value="Schemas"/> <data name="Category" value="DDL"/> + <data name="Category" value="S3"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -87,9 +90,28 @@ <p> When you create a database in Impala, the database can also be used by Hive. When you create a database in Hive, issue an <codeph>INVALIDATE METADATA</codeph> - statement in Impala to make Impala permanently aware of the new database. + statement in Impala to make Impala permanently aware of the new database. </p> + <p> + The <codeph>SHOW DATABASES</codeph> statement lists all databases, or the databases whose name + matches a wildcard pattern. <ph rev="2.5.0">In CDH 5.7 / Impala 2.5 and higher, the + <codeph>SHOW DATABASES</codeph> output includes a second column that displays the associated + comment, if any, for each database.</ph> + </p> + + <p conref="../shared/impala_common.xml#common/s3_blurb"/> + + <p rev="2.6.0 CDH-39913 IMPALA-1878"> + To specify that any tables created within a database reside on the Amazon S3 system, + you can include an <codeph>s3a://</codeph> prefix on the <codeph>LOCATION</codeph> + attribute. In CDH 5.8 / Impala 2.6 and higher, Impala automatically creates any + required folders as the databases, tables, and partitions are created, and removes + them when they are dropped. + </p> + + <p conref="../shared/impala_common.xml#common/s3_ddl"/> + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> <p conref="../shared/impala_common.xml#common/permissions_blurb"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_create_function.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_create_function.xml b/docs/topics/impala_create_function.xml index 4140289..77815f7 100644 --- a/docs/topics/impala_create_function.xml +++ b/docs/topics/impala_create_function.xml @@ -3,7 +3,7 @@ <concept rev="1.2" id="create_function"> <title>CREATE FUNCTION Statement</title> - <titlealts><navtitle>CREATE FUNCTION</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>CREATE FUNCTION</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -12,6 +12,8 @@ <data name="Category" value="Schemas"/> <data name="Category" value="Impala Functions"/> <data name="Category" value="UDFs"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -31,17 +33,39 @@ multiple functions that compute intermediate results across sets of rows. </p> + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + In CDH 5.7 / Impala 2.5 and higher, the syntax is also different for creating or dropping scalar Java-based UDFs. + The statements for Java UDFs use a new syntax, without any argument types or return type specified. Java-based UDFs + created using the new syntax persist across restarts of the Impala catalog server, and can be shared transparently + between Impala and Hive. + </p> + <p> - To create a scalar UDF, issue a <codeph>CREATE FUNCTION</codeph> statement: + To create a persistent scalar C++ UDF with <codeph>CREATE FUNCTION</codeph>: </p> <codeblock>CREATE FUNCTION [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>function_name</varname>([<varname>arg_type</varname>[, <varname>arg_type</varname>...]) RETURNS <varname>return_type</varname> - LOCATION '<varname>hdfs_path</varname>' - SYMBOL='<varname>symbol_or_class</varname>'</codeblock> + LOCATION '<varname>hdfs_path_to_dot_so</varname>' + SYMBOL='<varname>symbol_name</varname>'</codeblock> + + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + To create a persistent Java UDF with <codeph>CREATE FUNCTION</codeph>: +<codeblock>CREATE FUNCTION [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>function_name</varname> + LOCATION '<varname>hdfs_path_to_jar</varname>' + SYMBOL='<varname>class_name</varname>'</codeblock> + </p> + +<!-- +Examples: +CREATE FUNCTION IF NOT EXISTS foo location '/path/to/jar' SYMBOL='TestUdf'; +CREATE FUNCTION bar location '/path/to/jar' SYMBOL='TestUdf2'; +DROP FUNCTION foo; +DROP FUNCTION IF EXISTS bar; +--> <p> - To create a UDA, issue a <codeph>CREATE AGGREGATE FUNCTION</codeph> statement: + To create a persistent UDA, which must be written in C++, issue a <codeph>CREATE AGGREGATE FUNCTION</codeph> statement: </p> <codeblock>CREATE [AGGREGATE] FUNCTION [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>function_name</varname>([<varname>arg_type</varname>[, <varname>arg_type</varname>...]) @@ -54,7 +78,7 @@ [CLOSEFN='<varname>function</varname>] <ph rev="2.0.0">[SERIALIZE_FN='<varname>function</varname>]</ph> [FINALIZE_FN='<varname>function</varname>] -<!-- [INTERMEDIATE <varname>type_spec</varname>] --></codeblock> + <ph rev="2.3.0 IMPALA-1829 CDH-30572">[INTERMEDIATE <varname>type_spec</varname>]</ph></codeblock> <p conref="../shared/impala_common.xml#common/ddl_blurb"/> @@ -62,6 +86,12 @@ <b>Varargs notation:</b> </p> + <note rev="CDH-39271 CDH-38572"> + <p rev="CDH-39271 CDH-38572"> + Variable-length argument lists are supported for C++ UDFs, but currently not for Java UDFs. + </p> + </note> + <p> If the underlying implementation of your function accepts a variable number of arguments: </p> @@ -91,8 +121,8 @@ </li> </ul> - <p> - See <xref href="impala_udf.xml#udf_varargs"/> for how to code the C++ or Java function to accept + <p rev="CDH-39271 CDH-38572"> + See <xref href="impala_udf.xml#udf_varargs"/> for how to code a C++ UDF to accept variable-length argument lists. </p> @@ -196,6 +226,12 @@ most suited to reusing existing Hive UDFs. (Impala can run Java-based Hive UDFs but not Hive UDAs.) </li> + <li rev="2.5.0 IMPALA-1748 CDH-38369 IMPALA-2843 CDH-39148"> + CDH 5.7 / Impala 2.5 introduces UDF improvements to persistence for both C++ and Java UDFs, + and better compatibility between Impala and Hive for Java UDFs. + See <xref href="impala_udf.xml#udfs"/> for details. + </li> + <li> The body of the UDF is represented by a <codeph>.so</codeph> or <codeph>.jar</codeph> file, which you store in HDFS and the <codeph>CREATE FUNCTION</codeph> statement distributes to each Impala node. @@ -273,7 +309,9 @@ not Impala UDFs written in C++. </p> - <p conref="../shared/impala_common.xml#common/restrictions_blurb"/> + <p conref="../shared/impala_common.xml#common/current_user_caveat"/> + + <p><b>Persistence:</b></p> <p conref="../shared/impala_common.xml#common/udf_persistence_restriction"/> @@ -281,6 +319,169 @@ <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p> + For additional examples of all kinds of user-defined functions, see <xref href="impala_udf.xml#udfs"/>. + </p> + + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + The following example shows how to take a Java jar file and make all the functions inside one of its classes + into UDFs under a single (overloaded) function name in Impala. Each <codeph>CREATE FUNCTION</codeph> or + <codeph>DROP FUNCTION</codeph> statement applies to all the overloaded Java functions with the same name. + This example uses the signatureless syntax for <codeph>CREATE FUNCTION</codeph> and <codeph>DROP FUNCTION</codeph>, + which is available in CDH 5.7 / Impala 2.5 and higher. + </p> + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + At the start, the jar file is in the local filesystem. Then it is copied into HDFS, so that it is + available for Impala to reference through the <codeph>CREATE FUNCTION</codeph> statement and + queries that refer to the Impala function name. + </p> +<codeblock rev="2.5.0 IMPALA-2843 CDH-39148"> +$ jar -tvf udf-examples-cdh570.jar + 0 Mon Feb 22 04:06:50 PST 2016 META-INF/ + 122 Mon Feb 22 04:06:48 PST 2016 META-INF/MANIFEST.MF + 0 Mon Feb 22 04:06:46 PST 2016 com/ + 0 Mon Feb 22 04:06:46 PST 2016 com/cloudera/ + 0 Mon Feb 22 04:06:46 PST 2016 com/cloudera/impala/ + 2460 Mon Feb 22 04:06:46 PST 2016 com/cloudera/impala/IncompatibleUdfTest.class + 541 Mon Feb 22 04:06:46 PST 2016 com/cloudera/impala/TestUdfException.class + 3438 Mon Feb 22 04:06:46 PST 2016 com/cloudera/impala/JavaUdfTest.class + 5872 Mon Feb 22 04:06:46 PST 2016 com/cloudera/impala/TestUdf.class +... +$ hdfs dfs -put udf-examples-cdh570.jar /user/impala/udfs +$ hdfs dfs -ls /user/impala/udfs +Found 2 items +-rw-r--r-- 3 jrussell supergroup 853 2015-10-09 14:05 /user/impala/udfs/hello_world.jar +-rw-r--r-- 3 jrussell supergroup 7366 2016-06-08 14:25 /user/impala/udfs/udf-examples-cdh570.jar +</codeblock> + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + In <cmdname>impala-shell</cmdname>, the <codeph>CREATE FUNCTION</codeph> refers to the HDFS path of the jar file + and the fully qualified class name inside the jar. Each of the functions inside the class becomes an + Impala function, each one overloaded under the specified Impala function name. + </p> +<codeblock rev="2.5.0 IMPALA-2843 CDH-39148"> +[localhost:21000] > create function testudf location '/user/impala/udfs/udf-examples-cdh570.jar' symbol='com.cloudera.impala.TestUdf'; +[localhost:21000] > show functions; ++-------------+---------------------------------------+-------------+---------------+ +| return type | signature | binary type | is persistent | ++-------------+---------------------------------------+-------------+---------------+ +| BIGINT | testudf(BIGINT) | JAVA | true | +| BOOLEAN | testudf(BOOLEAN) | JAVA | true | +| BOOLEAN | testudf(BOOLEAN, BOOLEAN) | JAVA | true | +| BOOLEAN | testudf(BOOLEAN, BOOLEAN, BOOLEAN) | JAVA | true | +| DOUBLE | testudf(DOUBLE) | JAVA | true | +| DOUBLE | testudf(DOUBLE, DOUBLE) | JAVA | true | +| DOUBLE | testudf(DOUBLE, DOUBLE, DOUBLE) | JAVA | true | +| FLOAT | testudf(FLOAT) | JAVA | true | +| FLOAT | testudf(FLOAT, FLOAT) | JAVA | true | +| FLOAT | testudf(FLOAT, FLOAT, FLOAT) | JAVA | true | +| INT | testudf(INT) | JAVA | true | +| DOUBLE | testudf(INT, DOUBLE) | JAVA | true | +| INT | testudf(INT, INT) | JAVA | true | +| INT | testudf(INT, INT, INT) | JAVA | true | +| SMALLINT | testudf(SMALLINT) | JAVA | true | +| SMALLINT | testudf(SMALLINT, SMALLINT) | JAVA | true | +| SMALLINT | testudf(SMALLINT, SMALLINT, SMALLINT) | JAVA | true | +| STRING | testudf(STRING) | JAVA | true | +| STRING | testudf(STRING, STRING) | JAVA | true | +| STRING | testudf(STRING, STRING, STRING) | JAVA | true | +| TINYINT | testudf(TINYINT) | JAVA | true | ++-------------+---------------------------------------+-------------+---------------+ +</codeblock> + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + These are all simple functions that return their single arguments, or + sum, concatenate, and so on their multiple arguments. Impala determines which + overloaded function to use based on the number and types of the arguments. + </p> +<codeblock rev="2.5.0 IMPALA-2843 CDH-39148"> +insert into bigint_x values (1), (2), (4), (3); +select testudf(x) from bigint_x; ++-----------------+ +| udfs.testudf(x) | ++-----------------+ +| 1 | +| 2 | +| 4 | +| 3 | ++-----------------+ + +insert into int_x values (1), (2), (4), (3); +select testudf(x, x+1, x*x) from int_x; ++-------------------------------+ +| udfs.testudf(x, x + 1, x * x) | ++-------------------------------+ +| 4 | +| 9 | +| 25 | +| 16 | ++-------------------------------+ + +select testudf(x) from string_x; ++-----------------+ +| udfs.testudf(x) | ++-----------------+ +| one | +| two | +| four | +| three | ++-----------------+ +select testudf(x,x) from string_x; ++--------------------+ +| udfs.testudf(x, x) | ++--------------------+ +| oneone | +| twotwo | +| fourfour | +| threethree | ++--------------------+ +</codeblock> + + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + The previous example used the same Impala function name as the name of the class. + This example shows how the Impala function name is independent of the underlying + Java class or function names. A second <codeph>CREATE FUNCTION</codeph> statement + results in a set of overloaded functions all named <codeph>my_func</codeph>, + to go along with the overloaded functions all named <codeph>testudf</codeph>. + </p> +<codeblock rev="2.5.0 IMPALA-2843 CDH-39148"> +create function my_func location '/user/impala/udfs/udf-examples-cdh570.jar' + symbol='com.cloudera.impala.TestUdf'; + +show functions; ++-------------+---------------------------------------+-------------+---------------+ +| return type | signature | binary type | is persistent | ++-------------+---------------------------------------+-------------+---------------+ +| BIGINT | my_func(BIGINT) | JAVA | true | +| BOOLEAN | my_func(BOOLEAN) | JAVA | true | +| BOOLEAN | my_func(BOOLEAN, BOOLEAN) | JAVA | true | +... +| BIGINT | testudf(BIGINT) | JAVA | true | +| BOOLEAN | testudf(BOOLEAN) | JAVA | true | +| BOOLEAN | testudf(BOOLEAN, BOOLEAN) | JAVA | true | +... +</codeblock> + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + The corresponding <codeph>DROP FUNCTION</codeph> statement with no signature + drops all the overloaded functions with that name. + </p> +<codeblock rev="2.5.0 IMPALA-2843 CDH-39148"> +drop function my_func; +show functions; ++-------------+---------------------------------------+-------------+---------------+ +| return type | signature | binary type | is persistent | ++-------------+---------------------------------------+-------------+---------------+ +| BIGINT | testudf(BIGINT) | JAVA | true | +| BOOLEAN | testudf(BOOLEAN) | JAVA | true | +| BOOLEAN | testudf(BOOLEAN, BOOLEAN) | JAVA | true | +... +</codeblock> + <p rev="2.5.0 IMPALA-2843 CDH-39148"> + The signatureless <codeph>CREATE FUNCTION</codeph> syntax for Java UDFs ensures that + the functions shown in this example remain available after the Impala service + (specifically, the Catalog Server) are restarted. + </p> + <p conref="../shared/impala_common.xml#common/related_info"/> <p> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_create_role.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_create_role.xml b/docs/topics/impala_create_role.xml index 975ce15..8258b1f 100644 --- a/docs/topics/impala_create_role.xml +++ b/docs/topics/impala_create_role.xml @@ -3,14 +3,18 @@ <concept rev="1.4.0" id="create_role"> <title>CREATE ROLE Statement (CDH 5.2 or higher only)</title> - <titlealts><navtitle>CREATE ROLE (CDH 5.2 or higher only)</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>CREATE ROLE</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> <data name="Category" value="DDL"/> <data name="Category" value="SQL"/> <data name="Category" value="Sentry"/> + <data name="Category" value="Security"/> <data name="Category" value="Roles"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> <!-- Consider whether to go deeper into categories like Security for the Sentry-related statements. --> </metadata> </prolog> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_create_table.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_create_table.xml b/docs/topics/impala_create_table.xml index cdaee4a..26f99c4 100644 --- a/docs/topics/impala_create_table.xml +++ b/docs/topics/impala_create_table.xml @@ -1,26 +1,29 @@ <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> -<concept id="create_table"> +<concept id="create_table" outputclass="impala sql_statement"> - <title>CREATE TABLE Statement</title> - <titlealts><navtitle>CREATE TABLE</navtitle></titlealts> + <title outputclass="impala_title sql_statement_title">CREATE TABLE Statement</title> + <titlealts audience="PDF"><navtitle>CREATE TABLE</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> <data name="Category" value="SQL"/> <data name="Category" value="DDL"/> <data name="Category" value="Impala Data Types"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> <data name="Category" value="HDFS Caching"/> <data name="Category" value="Tables"/> <data name="Category" value="Schemas"/> - <data audience="impala_next" name="Category" value="Kudu"/> + <data name="Category" value="S3"/> + <!-- <data name="Category" value="Kudu"/> --> </metadata> </prolog> <conbody> <p> - <indexterm audience="Cloudera">CREATE TABLE statement</indexterm> + <indexterm audience="HTML">CREATE TABLE statement</indexterm> Creates a new table and specifies its characteristics. While creating a table, you optionally specify aspects such as: </p> @@ -58,9 +61,9 @@ </p> <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> - [(<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)] - [COMMENT '<varname>table_comment</varname>'] + (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...) [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)] + [COMMENT '<varname>table_comment</varname>'] [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] [ [ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>] @@ -97,10 +100,11 @@ data_type: </p> <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname> + <ph rev="2.5.0">[PARTITIONED BY (<varname>col_name</varname>[, ...])]</ph> [COMMENT '<varname>table_comment</varname>'] [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] [ - [ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>] + [ROW FORMAT <varname>row_format</varname>] <ph rev="CDH-41501">[STORED AS <varname>ctas_file_format</varname>]</ph> ] [LOCATION '<varname>hdfs_path</varname>'] [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] @@ -143,6 +147,10 @@ file_format: | AVRO | SEQUENCEFILE | RCFILE + +<ph rev="CDH-41501">ctas_file_format: + PARQUET + | TEXTFILE</ph> </codeblock> <p conref="../shared/impala_common.xml#common/ddl_blurb"/> @@ -151,6 +159,29 @@ file_format: Consider adding here, or at least making inline links to the relevant keywords in the syntax spec above. --> + <p> + <b>Column definitions:</b> + </p> + + <p> + Depending on the form of the <codeph>CREATE TABLE</codeph> statement, the column definitions are + required or not allowed. + </p> + + <p> + With the <codeph>CREATE TABLE AS SELECT</codeph> and <codeph>CREATE TABLE LIKE</codeph> + syntax, you do not specify the columns at all; the column names and types are derived from the source table, query, + or data file. + </p> + + <p> + With the basic <codeph>CREATE TABLE</codeph> syntax, you must list one or more columns, + its name, type, and optionally a comment, in addition to any columns used as partitioning keys. + There is one exception where the column list is not required: when creating an Avro table with the + <codeph>STORED AS AVRO</codeph> clause, you can omit the list of columns and specify the same metadata + as part of the <codeph>TBLPROPERTIES</codeph> clause. + </p> + <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> <p rev="2.3.0"> @@ -220,11 +251,20 @@ file_format: partitioning, see <xref href="impala_partitioning.xml#partitioning"/>. </p> - <p rev="kudu" audience="impala_next"> + <p rev="2.5.0"> + Prior to CDH 5.7 / Impala 2.5, you could use a partitioned table + as the source and copy data from it, but could not specify any partitioning clauses for the new table. + In CDH 5.7 / Impala 2.5 and higher, you can now use the <codeph>PARTITIONED BY</codeph> clause with a + <codeph>CREATE TABLE AS SELECT</codeph> statement. See the examples under the following discussion of + the <codeph>CREATE TABLE AS SELECT</codeph> syntax variation. + </p> + +<!-- + <p rev="kudu"> <b>Partitioning for Kudu tables (DISTRIBUTE BY clause)</b> </p> - <p rev="kudu" audience="impala_next"> + <p rev="kudu"> For Kudu tables, you specify logical partitioning across one or more columns using the <codeph>DISTRIBUTE BY</codeph> clause. In contrast to partitioning for HDFS-based tables, multiple values for a partition key column can be located in the same partition. @@ -234,16 +274,17 @@ file_format: The optional <codeph>RANGE</codeph> clause further subdivides the partitions, based on a set of literal values for the partition key columns. </p> +--> <p> <b>Specifying file format (STORED AS and ROW FORMAT clauses):</b> </p> - <p> + <p rev="DOCS-1523"> The <codeph>STORED AS</codeph> clause identifies the format of the underlying data files. Currently, Impala can query more types of file formats than it can create or insert into. Use Hive to perform any create or - data load operations that are not currently available in Impala. For example, Impala can create a - SequenceFile table but cannot insert data into it. There are also Impala-specific procedures for using + data load operations that are not currently available in Impala. For example, Impala can create an Avro, + SequenceFile, or RCFile table but cannot insert data into it. There are also Impala-specific procedures for using compression with each kind of file format. For details about working with data files of various formats, see <xref href="impala_file_formats.xml#file_formats"/>. </p> @@ -299,9 +340,11 @@ file_format: [STORED AS <varname>file_format</varname>] [LOCATION '<varname>hdfs_path</varname>']</codeblock> - <note rev="1.2"> - To clone the structure of a table and transfer data into it in a single operation, use the <codeph>CREATE - TABLE AS SELECT</codeph> syntax described in the next subsection. + <note rev="1.2.0"> + <p rev="1.2.0"> + To clone the structure of a table and transfer data into it in a single operation, use the <codeph>CREATE + TABLE AS SELECT</codeph> syntax described in the next subsection. + </p> </note> <p> @@ -350,24 +393,78 @@ file_format: definitions from another table, and copy data from the source table to the destination table without issuing any separate <codeph>INSERT</codeph> statement. This idiom is so popular that it has its own acronym, <q>CTAS</q>. -<!-- - The <codeph>CREATE TABLE AS SELECT</codeph> syntax is as follows: - --> </p> -<!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram, - thus commented out here. - Does CTAS only accept a limited subset of clauses? --> + <p> + The following examples show how to copy data from a source table <codeph>T1</codeph> + to a variety of destinations tables, applying various transformations to the table + properties, table layout, or the data itself as part of the operation: + </p> -<!-- -<codeblock rev="1.2">CREATE [EXTERNAL] TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname> - [COMMENT '<varname>table_comment</varname>'] - [STORED AS <varname>file_format</varname>] - [LOCATION '<varname>hdfs_path</varname>'] -AS - <varname>select_statement</varname></codeblock> +<codeblock> +-- Sample table to be the source of CTAS operations. +CREATE TABLE t1 (x INT, y STRING); +INSERT INTO t1 VALUES (1, 'one'), (2, 'two'), (3, 'three'); + +-- Clone all the columns and data from one table to another. +CREATE TABLE clone_of_t1 AS SELECT * FROM t1; ++-------------------+ +| summary | ++-------------------+ +| Inserted 3 row(s) | ++-------------------+ + +-- Clone the columns and data, and convert the data to a different file format. +CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1; ++-------------------+ +| summary | ++-------------------+ +| Inserted 3 row(s) | ++-------------------+ + +-- Copy only some rows to the new table. +CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x >= 2; ++-------------------+ +| summary | ++-------------------+ +| Inserted 2 row(s) | ++-------------------+ + +-- Same idea as CREATE TABLE LIKE: clone table layout but do not copy any data. +CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0; ++-------------------+ +| summary | ++-------------------+ +| Inserted 0 row(s) | ++-------------------+ + +-- Reorder and rename columns and transform the data. +CREATE TABLE t5 AS SELECT upper(y) AS s, x+1 AS a, 'Entirely new column' AS n FROM t1; ++-------------------+ +| summary | ++-------------------+ +| Inserted 3 row(s) | ++-------------------+ +SELECT * FROM t5; ++-------+---+---------------------+ +| s | a | n | ++-------+---+---------------------+ +| ONE | 2 | Entirely new column | +| TWO | 3 | Entirely new column | +| THREE | 4 | Entirely new column | ++-------+---+---------------------+ +</codeblock> + +<!-- These are a little heavyweight to get into here. Therefore commenting out. + Some overlap with the new column-changing examples in the code listing above. +Create tables with different column order, names, or types than the original. +CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1; +CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1; +CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock> --> +<!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram. --> + <p rev="1.2"> See <xref href="impala_select.xml#select"/> for details about query syntax for the <codeph>SELECT</codeph> portion of a <codeph>CREATE TABLE AS SELECT</codeph> statement. @@ -379,34 +476,112 @@ AS not carried over to the new table. </p> - <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/> + <note rev="DOCS-1523"> + When using the <codeph>STORED AS</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph> + statement, the destination table must be a file format that Impala can write to: currently, + text or Parquet. You cannot specify an Avro, SequenceFile, or RCFile table as the destination + table for a CTAS operation. + </note> + + <p rev="2.5.0"> + Prior to CDH 5.7 / Impala 2.5, you could use a partitioned table + as the source and copy data from it, but could not specify any partitioning clauses for the new table. + In CDH 5.7 / Impala 2.5 and higher, you can now use the <codeph>PARTITIONED BY</codeph> clause with a + <codeph>CREATE TABLE AS SELECT</codeph> statement. The following example demonstrates how you can copy + data from an unpartitioned table in a <codeph>CREATE TABLE AS SELECT</codeph> operation, creating a new + partitioned table in the process. The main syntax consideration is the column order in the <codeph>PARTITIONED BY</codeph> + clause and the select list: the partition key columns must be listed last in the select list, in the same + order as in the <codeph>PARTITIONED BY</codeph> clause. Therefore, in this case, the column order in the + destination table is different from the source table. You also only specify the column names in the + <codeph>PARTITIONED BY</codeph> clause, not the data types or column comments. + </p> + +<codeblock rev="2.5.0"> +create table partitions_no (year smallint, month tinyint, s string); +insert into partitions_no values (2016, 1, 'January 2016'), + (2016, 2, 'February 2016'), (2016, 3, 'March 2016'); + +-- Prove that the source table is not partitioned. +show partitions partitions_no; +ERROR: AnalysisException: Table is not partitioned: ctas_partition_by.partitions_no + +-- Create new table with partitions based on column values from source table. +<b>create table partitions_yes partitioned by (year, month) + as select s, year, month from partitions_no;</b> ++-------------------+ +| summary | ++-------------------+ +| Inserted 3 row(s) | ++-------------------+ + +-- Prove that the destination table is partitioned. +show partitions partitions_yes; ++-------+-------+-------+--------+------+... +| year | month | #Rows | #Files | Size |... ++-------+-------+-------+--------+------+... +| 2016 | 1 | -1 | 1 | 13B |... +| 2016 | 2 | -1 | 1 | 14B |... +| 2016 | 3 | -1 | 1 | 11B |... +| Total | | -1 | 3 | 38B |... ++-------+-------+-------+--------+------+... +</codeblock> + + <p rev="2.5.0"> + The most convenient layout for partitioned tables is with all the + partition key columns at the end. The CTAS <codeph>PARTITIONED BY</codeph> syntax + requires that column order in the select list, resulting in that same + column order in the destination table. + </p> + +<codeblock rev="2.5.0"> +describe partitions_no; ++-------+----------+---------+ +| name | type | comment | ++-------+----------+---------+ +| year | smallint | | +| month | tinyint | | +| s | string | | ++-------+----------+---------+ + +-- The CTAS operation forced us to put the partition key columns last. +-- Having those columns last works better with idioms such as SELECT * +-- for partitioned tables. +describe partitions_yes; ++-------+----------+---------+ +| name | type | comment | ++-------+----------+---------+ +| s | string | | +| year | smallint | | +| month | tinyint | | ++-------+----------+---------+ +</codeblock> + + <p rev="2.5.0"> + Attempting to use a select list with the partition key columns + not at the end results in an error due to a column name mismatch: + </p> + +<codeblock rev="2.5.0"> +-- We expect this CTAS to fail because non-key column S +-- comes after key columns YEAR and MONTH in the select list. +create table partitions_maybe partitioned by (year, month) + as select year, month, s from partitions_no; +ERROR: AnalysisException: Partition column name mismatch: year != month +</codeblock> <p rev="1.2"> For example, the following statements show how you can clone all the data in a table, or a subset of the columns and/or rows, or reorder columns, rename them, or construct them out of expressions: </p> -<codeblock rev="1.2">-- Create new table and copy all data. -CREATE TABLE clone_of_t1 AS SELECT * FROM t1; --- Same idea as CREATE TABLE LIKE, don't copy any data. -CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0; --- Copy some data. -CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x > 100 AND y LIKE 'A%'; -CREATE TABLE summary_of_t1 AS SELECT c1, sum(c2) AS total, avg(c2) AS average FROM t1 GROUP BY c2; --- Switch file format. -CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1; --- Create tables with different column order, names, or types than the original. -CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1; -CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1; -CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock> - <p rev="1.2"> As part of a CTAS operation, you can convert the data to any file format that Impala can write (currently, <codeph>TEXTFILE</codeph> and <codeph>PARQUET</codeph>). You cannot specify the lower-level properties of a - text table, such as the delimiter. Although you can use a partitioned table as the source and copy data from - it, you cannot specify any partitioning clauses for the new table. + text table, such as the delimiter. </p> + <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/> + <p rev="1.4.0"> <b>CREATE TABLE LIKE PARQUET:</b> </p> @@ -601,11 +776,16 @@ CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total <p rev="2.2.0"> To create a table where the data resides in the Amazon Simple Storage Service (S3), specify a <codeph>s3a://</codeph> prefix <codeph>LOCATION</codeph> attribute pointing to the data files in S3. - You can use this special <codeph>LOCATION</codeph> syntax when creating an empty table, - but not as part of a <codeph>CREATE TABLE AS SELECT</codeph> statement. - See <xref href="impala_s3.xml#s3"/> for details. </p> + <p rev="2.6.0 CDH-39913 IMPALA-1878"> + In CDH 5.8 / Impala 2.6 and higher, you can + use this special <codeph>LOCATION</codeph> syntax + as part of a <codeph>CREATE TABLE AS SELECT</codeph> statement. + </p> + + <p conref="../shared/impala_common.xml#common/s3_ddl"/> + <p conref="../shared/impala_common.xml#common/insert_sort_blurb"/> <p conref="../shared/impala_common.xml#common/hdfs_blurb"/> @@ -628,7 +808,9 @@ CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total <p conref="../shared/impala_common.xml#common/permissions_blurb"/> <p rev="CDH-19187"> - <!-- TBD. --> + The user ID that the <cmdname>impalad</cmdname> daemon runs under, + typically the <codeph>impala</codeph> user, must have both execute and write + permission for the database directory where the table is being created. </p> <p conref="../shared/impala_common.xml#common/security_blurb"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_create_view.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_create_view.xml b/docs/topics/impala_create_view.xml index 2458279..446acf2 100644 --- a/docs/topics/impala_create_view.xml +++ b/docs/topics/impala_create_view.xml @@ -3,7 +3,7 @@ <concept rev="1.1" id="create_view"> <title>CREATE VIEW Statement</title> - <titlealts><navtitle>CREATE VIEW</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>CREATE VIEW</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -11,6 +11,9 @@ <data name="Category" value="DDL"/> <data name="Category" value="Tables"/> <data name="Category" value="Schemas"/> + <data name="Category" value="Views"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_databases.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_databases.xml b/docs/topics/impala_databases.xml index ad0511f..3253e75 100644 --- a/docs/topics/impala_databases.xml +++ b/docs/topics/impala_databases.xml @@ -3,7 +3,7 @@ <concept id="databases"> <title>Overview of Impala Databases</title> - <titlealts><navtitle>Databases</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>Databases</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/>
