http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_block_mgr_memory.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_max_block_mgr_memory.xml b/docs/topics/impala_max_block_mgr_memory.xml new file mode 100644 index 0000000..3bf8ac8 --- /dev/null +++ b/docs/topics/impala_max_block_mgr_memory.xml @@ -0,0 +1,30 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="2.1.0" id="max_block_mgr_memory"> + + <title>MAX_BLOCK_MGR_MEMORY</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Memory"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.1.0"> + <indexterm audience="Cloudera">MAX_BLOCK_MGR_MEMORY query option</indexterm> + </p> + + <p></p> + + <p> + <b>Default:</b> + </p> + + <p conref="../shared/impala_common.xml#common/added_in_20"/> + </conbody> +</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_errors.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_max_errors.xml b/docs/topics/impala_max_errors.xml new file mode 100644 index 0000000..c6eb971 --- /dev/null +++ b/docs/topics/impala_max_errors.xml @@ -0,0 +1,47 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="max_errors"> + + <title>MAX_ERRORS Query Option</title> + <titlealts audience="PDF"><navtitle>MAX_ERRORS</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Logs"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">MAX_ERRORS query option</indexterm> + Maximum number of non-fatal errors for any particular query that are recorded in the Impala log file. For + example, if a billion-row table had a non-fatal data error in every row, you could diagnose the problem + without all billion errors being logged. Unspecified or 0 indicates the built-in default value of 1000. + </p> + + <p> + This option only controls how many errors are reported. To specify whether Impala continues or halts when it + encounters such errors, use the <codeph>ABORT_ON_ERROR</codeph> option. 
+ </p> + + <p> + <b>Type:</b> numeric + </p> + + <p> + <b>Default:</b> 0 (meaning 1000 errors) + </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_abort_on_error.xml#abort_on_error"/>, + <xref href="impala_logging.xml#logging"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_io_buffers.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_max_io_buffers.xml b/docs/topics/impala_max_io_buffers.xml new file mode 100644 index 0000000..477c833 --- /dev/null +++ b/docs/topics/impala_max_io_buffers.xml @@ -0,0 +1,31 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="max_io_buffers"> + + <title>MAX_IO_BUFFERS Query Option</title> + <titlealts audience="PDF"><navtitle>MAX_IO_BUFFERS</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Deprecated Features"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + Deprecated query option. Currently has no effect. 
+ </p> + + <p> + <b>Type:</b> numeric + </p> + + <p> + <b>Default:</b> 0 + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_num_runtime_filters.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_max_num_runtime_filters.xml b/docs/topics/impala_max_num_runtime_filters.xml new file mode 100644 index 0000000..edf4058 --- /dev/null +++ b/docs/topics/impala_max_num_runtime_filters.xml @@ -0,0 +1,61 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="max_num_runtime_filters" rev="2.5.0"> + + <title>MAX_NUM_RUNTIME_FILTERS Query Option (<keyword keyref="impala25"/> or higher only)</title> + <titlealts audience="PDF"><navtitle>MAX_NUM_RUNTIME_FILTERS</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.5.0"> + <indexterm audience="Cloudera">MAX_NUM_RUNTIME_FILTERS query option</indexterm> + The <codeph>MAX_NUM_RUNTIME_FILTERS</codeph> query option + sets an upper limit on the number of runtime filters that can be produced for each query. + </p> + + <p conref="../shared/impala_common.xml#common/type_integer"/> + + <p> + <b>Default:</b> 10 + </p> + + <p conref="../shared/impala_common.xml#common/added_in_250"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + Each runtime filter imposes some memory overhead on the query. + Depending on the setting of the <codeph>RUNTIME_BLOOM_FILTER_SIZE</codeph> + query option, each filter might consume between 1 and 16 megabytes + per plan fragment. There are typically 5 or fewer filters per plan fragment. 
+ </p> + + <p> + Impala evaluates the effectiveness of each filter, and keeps the + ones that eliminate the largest number of partitions or rows. + Therefore, this setting can protect against + potential problems due to excessive memory overhead for filter production, + while still allowing a high level of optimization for suitable queries. + </p> + + <p conref="../shared/impala_common.xml#common/runtime_filtering_option_caveat"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + <p> + <xref href="impala_runtime_filtering.xml"/>, + <!-- <xref href="impala_partitioning.xml#dynamic_partition_pruning"/>, --> + <xref href="impala_runtime_bloom_filter_size.xml#runtime_bloom_filter_size"/>, + <xref href="impala_runtime_filter_mode.xml#runtime_filter_mode"/> + </p> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_max_scan_range_length.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_max_scan_range_length.xml b/docs/topics/impala_max_scan_range_length.xml new file mode 100644 index 0000000..a99b92b --- /dev/null +++ b/docs/topics/impala_max_scan_range_length.xml @@ -0,0 +1,55 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="max_scan_range_length"> + + <title>MAX_SCAN_RANGE_LENGTH Query Option</title> + <titlealts audience="PDF"><navtitle>MAX_SCAN_RANGE_LENGTH</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Performance"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">MAX_SCAN_RANGE_LENGTH query option</indexterm> + Maximum length of the scan range. 
Interacts with the number of HDFS blocks in the table to determine how many + CPU cores across the cluster are involved with the processing for a query. (Each core processes one scan + range.) + </p> + + <p> + Lowering the value can sometimes increase parallelism if you have unused CPU capacity, but a too-small value + can limit query performance because each scan range involves extra overhead. + </p> + + <p> + Only applicable to HDFS tables. Has no effect on Parquet tables. Unspecified or 0 indicates backend default, + which is the same as the HDFS block size for each table. + </p> + + <p> + Although the scan range can be arbitrarily long, Impala internally uses an 8 MB read buffer so that it can + query tables with huge block sizes without allocating equivalent blocks of memory. + </p> + + <p> + <b>Type:</b> numeric + </p> + + <p rev="IMPALA-3799 CDH-43746"> + In <keyword keyref="impala27_full"/> and higher, the argument value can include unit specifiers, + such as <codeph>100m</codeph> or <codeph>100mb</codeph>. In previous versions, + Impala interpreted such formatted values as 0, leading to query failures. 
+ </p> + + <p> + <b>Default:</b> 0 + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_mem_limit.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_mem_limit.xml b/docs/topics/impala_mem_limit.xml new file mode 100644 index 0000000..8ed6fbe --- /dev/null +++ b/docs/topics/impala_mem_limit.xml @@ -0,0 +1,212 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="mem_limit"> + + <title>MEM_LIMIT Query Option</title> + <titlealts audience="PDF"><navtitle>MEM_LIMIT</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Scalability"/> + <data name="Category" value="Memory"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">MEM_LIMIT query option</indexterm> + When resource management is not enabled, defines the maximum amount of memory a query can allocate on each node. + Therefore, the total memory that can be used by a query is the <codeph>MEM_LIMIT</codeph> times the number of nodes. + </p> + + <p rev="CDH-32135"> + There are two levels of memory limit for Impala. + The <codeph>-mem_limit</codeph> startup option sets an overall limit for the <cmdname>impalad</cmdname> process + (which handles multiple queries concurrently). + That limit is typically expressed in terms of a percentage of the RAM available on the host, such as <codeph>-mem_limit=70%</codeph>. + The <codeph>MEM_LIMIT</codeph> query option, which you set through <cmdname>impala-shell</cmdname> + or the <codeph>SET</codeph> statement in a JDBC or ODBC application, applies to each individual query. 
+ The <codeph>MEM_LIMIT</codeph> query option is usually expressed as a fixed size such as <codeph>10gb</codeph>, + and must always be less than the <cmdname>impalad</cmdname> memory limit. + </p> + + <p rev="CDH-32135"> + If query processing exceeds the specified memory limit on any node, either the per-query limit or the + <cmdname>impalad</cmdname> limit, Impala cancels the query automatically. + Memory limits are checked periodically during query processing, so the actual memory in use + might briefly exceed the limit without the query being cancelled. + </p> + + <p> + When resource management is enabled in CDH 5, the mechanism for this option changes. If set, it overrides the + automatic memory estimate from Impala. Impala requests this amount of memory from YARN on each node, and the + query does not proceed until that much memory is available. The actual memory used by the query could be + lower, since some queries use much less memory than others. With resource management, the + <codeph>MEM_LIMIT</codeph> setting acts both as a hard limit on the amount of memory a query can use on any + node (enforced by YARN) and a guarantee that that much memory will be available on each node while the query + is being executed. When resource management is enabled but no <codeph>MEM_LIMIT</codeph> setting is + specified, Impala estimates the amount of memory needed on each node for each query, requests that much + memory from YARN before starting the query, and then internally sets the <codeph>MEM_LIMIT</codeph> on each + node to the requested amount of memory during the query. Thus, if the query takes more memory than was + originally estimated, Impala detects that the <codeph>MEM_LIMIT</codeph> is exceeded and cancels the query + itself. 
+ </p> + + <p> + <b>Type:</b> numeric + </p> + + <p rev="CDH-32135"> + <b>Units:</b> A numeric argument represents memory size in bytes; you can also use a suffix of <codeph>m</codeph> or <codeph>mb</codeph> + for megabytes, or more commonly <codeph>g</codeph> or <codeph>gb</codeph> for gigabytes. If you specify a value with unrecognized + formats, subsequent queries fail with an error. + </p> + + <p rev="CDH-32135"> + <b>Default:</b> 0 (unlimited) + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p rev="CDH-32135"> + The <codeph>MEM_LIMIT</codeph> setting is primarily useful in a high-concurrency setting, + or on a cluster with a workload shared between Impala and other data processing components. + You can prevent any query from accidentally using much more memory than expected, + which could negatively impact other Impala queries. + </p> + + <p rev="CDH-32135"> + Use the output of the <codeph>SUMMARY</codeph> command in <cmdname>impala-shell</cmdname> + to get a report of memory used for each phase of your most heavyweight queries on each node, + and then set a <codeph>MEM_LIMIT</codeph> somewhat higher than that. + See <xref href="impala_explain_plan.xml#perf_summary"/> for usage information about + the <codeph>SUMMARY</codeph> command. + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb" rev="CDH-32135"/> + + <p rev="CDH-32135"> + The following examples show how to set the <codeph>MEM_LIMIT</codeph> query option + using a fixed number of bytes, or suffixes representing gigabytes or megabytes. 
+ </p> + +<codeblock rev="CDH-32135"> +[localhost:21000] > set mem_limit=3000000000; +MEM_LIMIT set to 3000000000 +[localhost:21000] > select 5; +Query: select 5 ++---+ +| 5 | ++---+ +| 5 | ++---+ + +[localhost:21000] > set mem_limit=3g; +MEM_LIMIT set to 3g +[localhost:21000] > select 5; +Query: select 5 ++---+ +| 5 | ++---+ +| 5 | ++---+ + +[localhost:21000] > set mem_limit=3gb; +MEM_LIMIT set to 3gb +[localhost:21000] > select 5; ++---+ +| 5 | ++---+ +| 5 | ++---+ + +[localhost:21000] > set mem_limit=3m; +MEM_LIMIT set to 3m +[localhost:21000] > select 5; ++---+ +| 5 | ++---+ +| 5 | ++---+ +[localhost:21000] > set mem_limit=3mb; +MEM_LIMIT set to 3mb +[nightly55-2.vpc.cloudera.com:21000] > select 5; ++---+ +| 5 | ++---+ +</codeblock> + + <p rev="CDH-32135"> + The following examples show how unrecognized <codeph>MEM_LIMIT</codeph> + values lead to errors for subsequent queries. + </p> + +<codeblock rev="CDH-32135"> +[localhost:21000] > set mem_limit=3tb; +MEM_LIMIT set to 3tb +[localhost:21000] > select 5; +ERROR: Failed to parse query memory limit from '3tb'. + +[localhost:21000] > set mem_limit=xyz; +MEM_LIMIT set to xyz +[localhost:21000] > select 5; +Query: select 5 +ERROR: Failed to parse query memory limit from 'xyz'. +</codeblock> + + <p rev="CDH-32135"> + The following examples shows the automatic query cancellation + when the <codeph>MEM_LIMIT</codeph> value is exceeded + on any host involved in the Impala query. First it runs a + successful query and checks the largest amount of memory + used on any node for any stage of the query. + Then it sets an artificially low <codeph>MEM_LIMIT</codeph> + setting so that the same query cannot run. 
+ </p> + +<codeblock rev="CDH-32135"> +[localhost:21000] > select count(*) from customer; +Query: select count(*) from customer ++----------+ +| count(*) | ++----------+ +| 150000 | ++----------+ + +[localhost:21000] > select count(distinct c_name) from customer; +Query: select count(distinct c_name) from customer ++------------------------+ +| count(distinct c_name) | ++------------------------+ +| 150000 | ++------------------------+ + +[localhost:21000] > summary; ++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+ +| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail | ++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+ +| 06:AGGREGATE | 1 | 230.00ms | 230.00ms | 1 | 1 | 16.00 KB | -1 B | FINALIZE | +| 05:EXCHANGE | 1 | 43.44us | 43.44us | 1 | 1 | 0 B | -1 B | UNPARTITIONED | +| 02:AGGREGATE | 1 | 227.14ms | 227.14ms | 1 | 1 | 12.00 KB | 10.00 MB | | +| 04:AGGREGATE | 1 | 126.27ms | 126.27ms | 150.00K | 150.00K | 15.17 MB | 10.00 MB | | +| 03:EXCHANGE | 1 | 44.07ms | 44.07ms | 150.00K | 150.00K | 0 B | 0 B | HASH(c_name) | +<b>| 01:AGGREGATE | 1 | 361.94ms | 361.94ms | 150.00K | 150.00K | 23.04 MB | 10.00 MB | |</b> +| 00:SCAN HDFS | 1 | 43.64ms | 43.64ms | 150.00K | 150.00K | 24.19 MB | 64.00 MB | tpch.customer | ++--------------+--------+----------+----------+---------+------------+----------+---------------+---------------+ + +[localhost:21000] > set mem_limit=15mb; +MEM_LIMIT set to 15mb +[localhost:21000] > select count(distinct c_name) from customer; +Query: select count(distinct c_name) from customer +ERROR: +Memory limit exceeded +Query did not have enough memory to get the minimum required buffers in the block manager. 
+</codeblock> + + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_min.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_min.xml b/docs/topics/impala_min.xml new file mode 100644 index 0000000..047b4ef --- /dev/null +++ b/docs/topics/impala_min.xml @@ -0,0 +1,197 @@ +<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="min"> + + <title>MIN Function</title> + <titlealts audience="PDF"><navtitle>MIN</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Impala Functions"/> + <data name="Category" value="Analytic Functions"/> + <data name="Category" value="Aggregate Functions"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">min() function</indexterm> + An aggregate function that returns the minimum value from a set of numbers. Opposite of the + <codeph>MAX</codeph> function. Its single argument can be numeric column, or the numeric result of a function + or expression applied to the column value. Rows with a <codeph>NULL</codeph> value for the specified column + are ignored. If the table is empty, or all the values supplied to <codeph>MIN</codeph> are + <codeph>NULL</codeph>, <codeph>MIN</codeph> returns <codeph>NULL</codeph>. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>MIN([DISTINCT | ALL] <varname>expression</varname>) [OVER (<varname>analytic_clause</varname>)]</codeblock> + + <p> + When the query contains a <codeph>GROUP BY</codeph> clause, returns one value for each combination of + grouping values. 
+ </p> + + <p conref="../shared/impala_common.xml#common/restrictions_sliding_window"/> + + <p conref="../shared/impala_common.xml#common/return_type_same_except_string"/> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p conref="../shared/impala_common.xml#common/partition_key_optimization"/> + + <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> + + <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/> + + <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<codeblock>-- Find the smallest value for this column in the table. +select min(c1) from t1; +-- Find the smallest value for this column from a subset of the table. +select min(c1) from t1 where month = 'January' and year = '2013'; +-- Find the smallest value from a set of numeric function results. +select min(length(s)) from t1; +-- Can also be used in combination with DISTINCT and/or GROUP BY. +-- Return more than one result. +select month, year, min(purchase_price) from store_stats group by month, year; +-- Filter the input to eliminate duplicates before performing the calculation. +select min(distinct x) from t1; +</codeblock> + + <p rev="2.0.0"> + The following examples show how to use <codeph>MIN()</codeph> in an analytic context. They use a table + containing integers from 1 to 10. Notice how the <codeph>MIN()</codeph> is reported for each input value, as + opposed to the <codeph>GROUP BY</codeph> clause which condenses the result set. 
+<codeblock>select x, property, min(x) over (partition by property) as min from int_t where property in ('odd','even'); ++----+----------+-----+ +| x | property | min | ++----+----------+-----+ +| 2 | even | 2 | +| 4 | even | 2 | +| 6 | even | 2 | +| 8 | even | 2 | +| 10 | even | 2 | +| 1 | odd | 1 | +| 3 | odd | 1 | +| 5 | odd | 1 | +| 7 | odd | 1 | +| 9 | odd | 1 | ++----+----------+-----+ +</codeblock> + +Adding an <codeph>ORDER BY</codeph> clause lets you experiment with results that are cumulative or apply to a moving +set of rows (the <q>window</q>). The following examples use <codeph>MIN()</codeph> in an analytic context +(that is, with an <codeph>OVER()</codeph> clause) to display the smallest value of <codeph>X</codeph> +encountered up to each row in the result set. The examples use two columns in the <codeph>ORDER BY</codeph> +clause to produce a sequence of values that rises and falls, to illustrate how the <codeph>MIN()</codeph> +result only decreases or stays the same throughout each partition within the result set. 
+The basic <codeph>ORDER BY x</codeph> clause implicitly +activates a window clause of <codeph>RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>, +which is effectively the same as <codeph>ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW</codeph>, +therefore all of these examples produce the same results: + +<codeblock>select x, property, min(x) <b>over (order by property, x desc)</b> as 'minimum to this point' + from int_t where property in ('prime','square'); ++---+----------+-----------------------+ +| x | property | minimum to this point | ++---+----------+-----------------------+ +| 7 | prime | 7 | +| 5 | prime | 5 | +| 3 | prime | 3 | +| 2 | prime | 2 | +| 9 | square | 2 | +| 4 | square | 2 | +| 1 | square | 1 | ++---+----------+-----------------------+ + +select x, property, + min(x) over + ( + <b>order by property, x desc</b> + <b>range between unbounded preceding and current row</b> + ) as 'minimum to this point' +from int_t where property in ('prime','square'); ++---+----------+-----------------------+ +| x | property | minimum to this point | ++---+----------+-----------------------+ +| 7 | prime | 7 | +| 5 | prime | 5 | +| 3 | prime | 3 | +| 2 | prime | 2 | +| 9 | square | 2 | +| 4 | square | 2 | +| 1 | square | 1 | ++---+----------+-----------------------+ + +select x, property, + min(x) over + ( + <b>order by property, x desc</b> + <b>rows between unbounded preceding and current row</b> + ) as 'minimum to this point' +from int_t where property in ('prime','square'); ++---+----------+-----------------------+ +| x | property | minimum to this point | ++---+----------+-----------------------+ +| 7 | prime | 7 | +| 5 | prime | 5 | +| 3 | prime | 3 | +| 2 | prime | 2 | +| 9 | square | 2 | +| 4 | square | 2 | +| 1 | square | 1 | ++---+----------+-----------------------+ +</codeblock> + +The following examples show how to construct a moving window, with a running minimum taking into account all rows before +and 1 row after the current row. 
+Because of a restriction in the Impala <codeph>RANGE</codeph> syntax, this type of +moving window is possible with the <codeph>ROWS BETWEEN</codeph> clause but not the <codeph>RANGE BETWEEN</codeph> clause. +Because of an extra Impala restriction on the <codeph>MAX()</codeph> and <codeph>MIN()</codeph> functions in an +analytic context, the lower bound must be <codeph>UNBOUNDED PRECEDING</codeph>. +<codeblock>select x, property, + min(x) over + ( + <b>order by property, x desc</b> + <b>rows between unbounded preceding and 1 following</b> + ) as 'local minimum' +from int_t where property in ('prime','square'); ++---+----------+---------------+ +| x | property | local minimum | ++---+----------+---------------+ +| 7 | prime | 5 | +| 5 | prime | 3 | +| 3 | prime | 2 | +| 2 | prime | 2 | +| 9 | square | 2 | +| 4 | square | 1 | +| 1 | square | 1 | ++---+----------+---------------+ + +-- Doesn't work because of syntax restriction on RANGE clause. +select x, property, + min(x) over + ( + <b>order by property, x desc</b> + <b>range between unbounded preceding and 1 following</b> + ) as 'local minimum' +from int_t where property in ('prime','square'); +ERROR: AnalysisException: RANGE is only supported with both the lower and upper bounds UNBOUNDED or one UNBOUNDED and the other CURRENT ROW. 
+</codeblock> + </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_analytic_functions.xml#analytic_functions"/>, <xref href="impala_max.xml#max"/>, + <xref href="impala_avg.xml#avg"/> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_misc_functions.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_misc_functions.xml b/docs/topics/impala_misc_functions.xml new file mode 100644 index 0000000..196f3c2 --- /dev/null +++ b/docs/topics/impala_misc_functions.xml @@ -0,0 +1,196 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="misc_functions"> + + <title>Impala Miscellaneous Functions</title> + <titlealts audience="PDF"><navtitle>Miscellaneous Functions</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Functions"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Querying"/> + </metadata> + </prolog> + + <conbody> + + <p> + Impala supports the following utility functions that do not operate on a particular column or data type: + </p> + + <dl> + <dlentry rev="1.3.0" id="current_database"> + + <dt> + <codeph>current_database()</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">current_database() function</indexterm> + <b>Purpose:</b> Returns the database that the session is currently using, either <codeph>default</codeph> + if no database has been selected, or whatever database the session switched to through a + <codeph>USE</codeph> statement or the <cmdname>impalad</cmdname> <codeph>-d</codeph> option. 
+ <p> + <b>Return type:</b> <codeph>string</codeph> + </p> + </dd> + + </dlentry> + + <dlentry rev="5.4.5" id="effective_user"> + + <dt> + <codeph>effective_user()</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">effective_user() function</indexterm> + <b>Purpose:</b> Typically returns the same value as <codeph>user()</codeph>, + except if delegation is enabled, in which case it returns the ID of the delegated user. + <p> + <b>Return type:</b> <codeph>string</codeph> + </p> + <p> + <b>Added in:</b> CDH 5.4.5 + </p> + </dd> + + </dlentry> + + <dlentry rev="1.3.0" id="pid"> + + <dt> + <codeph>pid()</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">pid() function</indexterm> + <b>Purpose:</b> Returns the process ID of the <cmdname>impalad</cmdname> daemon that the session is + connected to. You can use it during low-level debugging, to issue Linux commands that trace, show the + arguments, and so on, for the <cmdname>impalad</cmdname> process. + <p> + <b>Return type:</b> <codeph>int</codeph> + </p> + </dd> + + </dlentry> + + <dlentry audience="Cloudera" id="sleep"> + + <dt> + <codeph>sleep(int ms)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">sleep() function</indexterm> + <b>Purpose:</b> Pauses the query for a specified number of milliseconds. For slowing down queries with + small result sets enough to monitor runtime execution, memory usage, or other factors that otherwise + would be difficult to capture during the brief interval of query execution. When used in the + <codeph>SELECT</codeph> list, it is called once for each row in the result set; adjust the number of + milliseconds accordingly. For example, a query <codeph>SELECT *, sleep(5) FROM + table_with_1000_rows</codeph> would take at least 5 seconds to complete (5 milliseconds * 1000 rows in + result set). To avoid an excessive number of concurrent queries, use this function for troubleshooting on + test and development systems, not for production queries. 
+ <p> + <b>Return type:</b> N/A + </p> + </dd> + + </dlentry> + + <dlentry rev="1.1" id="user"> + + <dt> + <codeph>user()</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">user() function</indexterm> + <b>Purpose:</b> Returns the username of the Linux user who is connected to the <cmdname>impalad</cmdname> + daemon. Typically called a single time, in a query without any <codeph>FROM</codeph> clause, to + understand how authorization settings apply in a security context; once you know the logged-in username, + you can check which groups that user belongs to, and from the list of groups you can check which roles + are available to those groups through the authorization policy file. + <p conref="../shared/impala_common.xml#common/user_kerberized"/> + <p> + When delegation is enabled, consider calling the <codeph>effective_user()</codeph> function instead. + </p> + <p> + <b>Return type:</b> <codeph>string</codeph> + </p> + </dd> + + </dlentry> + + <dlentry rev="2.5.0 IMPALA-1477" id="uuid"> + + <dt> + <codeph>uuid()</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">uuid() function</indexterm> + <b>Purpose:</b> Returns a <xref href="https://en.wikipedia.org/wiki/Universally_unique_identifier" scope="external" format="html">universal unique identifier</xref>, a 128-bit value encoded as a string with groups of hexadecimal digits separated by dashes. + <p> + <b>Return type:</b> <codeph>string</codeph> + </p> + <p conref="../shared/impala_common.xml#common/added_in_250"/> + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + Ascending numeric sequences of type <codeph>BIGINT</codeph> are often used + as identifiers within a table, and as join keys across multiple tables. + The <codeph>uuid()</codeph> value is a convenient alternative that does not + require storing or querying the highest sequence number. 
For example, you + can use it to quickly construct new unique identifiers during a data import job, + or to combine data from different tables without the likelihood of ID collisions. + </p> + <p conref="../shared/impala_common.xml#common/example_blurb"/> +<codeblock> +-- Each call to uuid() produces a new arbitrary value. +select uuid(); ++--------------------------------------+ +| uuid() | ++--------------------------------------+ +| c7013e25-1455-457f-bf74-a2046e58caea | ++--------------------------------------+ + +-- If you get a UUID for each row of a result set, you can use it as a +-- unique identifier within a table, or even a unique ID across tables. +select uuid() from four_row_table; ++--------------------------------------+ +| uuid() | ++--------------------------------------+ +| 51d3c540-85e5-4cb9-9110-604e53999e2e | +| 0bb40071-92f6-4a59-a6a4-60d46e9703e2 | +| 5e9d7c36-9842-4a96-862d-c13cd0457c02 | +| cae29095-0cc0-4053-a5ea-7fcd3c780861 | ++--------------------------------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="version"> + + <dt> + <codeph>version()</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">version() function</indexterm> + <b>Purpose:</b> Returns information such as the precise version number and build date for the + <codeph>impalad</codeph> daemon that you are currently connected to. Typically used to confirm that you + are connected to the expected level of Impala to use a particular feature, or to connect to several nodes + and confirm they are all running the same level of <cmdname>impalad</cmdname>. 
+ <p> + <b>Return type:</b> <codeph>string</codeph> (with one or more embedded newlines) + </p> + </dd> + + </dlentry> + </dl> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_mixed_security.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_mixed_security.xml b/docs/topics/impala_mixed_security.xml new file mode 100644 index 0000000..b9e6933 --- /dev/null +++ b/docs/topics/impala_mixed_security.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="mixed_security"> + + <title>Using Multiple Authentication Methods with Impala</title> + <prolog> + <metadata> + <data name="Category" value="Security"/> + <data name="Category" value="Impala"/> + <data name="Category" value="Authentication"/> + <data name="Category" value="Kerberos"/> + <data name="Category" value="LDAP"/> + <data name="Category" value="Administrators"/> + </metadata> + </prolog> + + <conbody> + + <p> + Impala 2.0 and later automatically handles both Kerberos and LDAP authentication. Each + <cmdname>impalad</cmdname> daemon can accept both Kerberos and LDAP requests through the same port. No + special actions need to be taken if some users authenticate through Kerberos and some through LDAP. + </p> + + <p> + Prior to Impala 2.0, you had to configure each <cmdname>impalad</cmdname> to listen on a specific port + depending on the kind of authentication, then configure your network load balancer to forward each kind of + request to a DataNode that was set up with the appropriate authentication type. Once the initial request was + made using either Kerberos or LDAP authentication, Impala automatically handled the process of coordinating + the work across multiple nodes and transmitting intermediate results back to the coordinator node. 
+ </p> + +<!-- + <p> + This technique is most suitable for larger clusters, where + you are already using load balancing software for high availability. + You configure Impala to run on a different port on the nodes configured for LDAP. + Then you configure the load balancing software to forward Kerberos + connection requests to nodes using the default port, and LDAP connection requests + to nodes using an alternative port for LDAP. + Consult the documentation for your load balancing software for how to + configure that type of forwarding. + </p> +--> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_ndv.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_ndv.xml b/docs/topics/impala_ndv.xml new file mode 100644 index 0000000..fcdae77 --- /dev/null +++ b/docs/topics/impala_ndv.xml @@ -0,0 +1,135 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="1.2.1" id="ndv"> + + <title>NDV Function</title> + <titlealts audience="PDF"><navtitle>NDV</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Impala Functions"/> + <data name="Category" value="Aggregate Functions"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">ndv() function</indexterm> + An aggregate function that returns an approximate value similar to the result of <codeph>COUNT(DISTINCT + <varname>col</varname>)</codeph>, the <q>number of distinct values</q>. 
It is much faster than the + combination of <codeph>COUNT</codeph> and <codeph>DISTINCT</codeph>, and uses a constant amount of memory and + thus is less memory-intensive for columns with high cardinality. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>NDV([DISTINCT | ALL] <varname>expression</varname>)</codeblock> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p rev="1.2.2"> + This is the mechanism used internally by the <codeph>COMPUTE STATS</codeph> statement for computing the + number of distinct values in a column. + </p> + + <p> + Because this number is an estimate, it might not reflect the precise number of different values in the + column, especially if the cardinality is very low or very high. If the estimated number is higher than the + number of rows in the table, Impala adjusts the value internally during query planning. + </p> + + <p conref="../shared/impala_common.xml#common/former_odd_return_type_string"/> + +<!-- <p conref="../shared/impala_common.xml#common/restrictions_sliding_window"/> --> + + <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> + + <p conref="../shared/impala_common.xml#common/complex_types_aggregation_explanation"/> + + <p conref="../shared/impala_common.xml#common/complex_types_aggregation_example"/> + + <p conref="../shared/impala_common.xml#common/restrictions_blurb"/> + + <p conref="../shared/impala_common.xml#common/analytic_not_allowed_caveat"/> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + + <p> + The following example queries a billion-row table to illustrate the relative performance of + <codeph>COUNT(DISTINCT)</codeph> and <codeph>NDV()</codeph>. It shows how <codeph>COUNT(DISTINCT)</codeph> + gives a precise answer, but is inefficient for large-scale data where an approximate result is sufficient. + The <codeph>NDV()</codeph> function gives an approximate result but is much faster. 
+ </p> + +<codeblock>select count(distinct col1) from sample_data; ++---------------------+ +| count(distinct col1)| ++---------------------+ +| 100000 | ++---------------------+ +Fetched 1 row(s) in 20.13s + +select cast(ndv(col1) as bigint) as col1 from sample_data; ++----------+ +| col1 | ++----------+ +| 139017 | ++----------+ +Fetched 1 row(s) in 8.91s +</codeblock> + + <p> + The following example shows how you can code multiple <codeph>NDV()</codeph> calls in a single query, to + easily learn which columns have substantially more or fewer distinct values. This technique is faster than + running a sequence of queries with <codeph>COUNT(DISTINCT)</codeph> calls. + </p> + +<codeblock>select cast(ndv(col1) as bigint) as col1, cast(ndv(col2) as bigint) as col2, + cast(ndv(col3) as bigint) as col3, cast(ndv(col4) as bigint) as col4 + from sample_data; ++----------+-----------+------------+-----------+ +| col1 | col2 | col3 | col4 | ++----------+-----------+------------+-----------+ +| 139017 | 282 | 46 | 145636240 | ++----------+-----------+------------+-----------+ +Fetched 1 row(s) in 34.97s + +select count(distinct col1) from sample_data; ++---------------------+ +| count(distinct col1)| ++---------------------+ +| 100000 | ++---------------------+ +Fetched 1 row(s) in 20.13s + +select count(distinct col2) from sample_data; ++----------------------+ +| count(distinct col2) | ++----------------------+ +| 278 | ++----------------------+ +Fetched 1 row(s) in 20.09s + +select count(distinct col3) from sample_data; ++-----------------------+ +| count(distinct col3) | ++-----------------------+ +| 46 | ++-----------------------+ +Fetched 1 row(s) in 19.12s + +select count(distinct col4) from sample_data; ++----------------------+ +| count(distinct col4) | ++----------------------+ +| 147135880 | ++----------------------+ +Fetched 1 row(s) in 266.95s +</codeblock> + </conbody> +</concept>
