http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_explain.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_explain.xml b/docs/topics/impala_explain.xml index c9e8846..81cc17b 100644 --- a/docs/topics/impala_explain.xml +++ b/docs/topics/impala_explain.xml @@ -3,7 +3,7 @@ <concept id="explain"> <title>EXPLAIN Statement</title> - <titlealts><navtitle>EXPLAIN</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>EXPLAIN</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -13,6 +13,9 @@ <data name="Category" value="Planning"/> <data name="Category" value="Performance"/> <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -96,11 +99,12 @@ <p rev="1.2"> When extended <codeph>EXPLAIN</codeph> output is enabled, <codeph>EXPLAIN</codeph> statements print - information about estimated memory requirements, minimum number of virtual cores, and so on that you can use - to fine-tune the resource management options explained in - <xref href="impala_resource_management.xml#rm_options"/>. (The estimated memory requirements are - intentionally on the high side, to allow a margin for error, to avoid cancelling a query unnecessarily if you - set the <codeph>MEM_LIMIT</codeph> option to the estimated memory figure.) + information about estimated memory requirements, minimum number of virtual cores, and so on. + <!-- + that you can use to fine-tune the resource management options explained in <xref href="impala_resource_management.xml#rm_options"/>. + (The estimated memory requirements are intentionally on the high side, to allow a margin for error, + to avoid cancelling a query unnecessarily if you set the <codeph>MEM_LIMIT</codeph> option to the estimated memory figure.) 
+ --> </p> <p> @@ -145,9 +149,9 @@ statement has additional information to use in deciding how to optimize the distributed query. </p> - <draft-comment translate="no"> -Re-run these examples with more substantial tables populated with data. -</draft-comment> + <!-- To do: + Re-run these examples with more substantial tables populated with data. + --> <codeblock rev="1.2">[localhost:21000] > set explain_level=extended; EXPLAIN_LEVEL set to extended
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_explain_level.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_explain_level.xml b/docs/topics/impala_explain_level.xml index f54e8a8..e0c30d2 100644 --- a/docs/topics/impala_explain_level.xml +++ b/docs/topics/impala_explain_level.xml @@ -3,6 +3,7 @@ <concept rev="1.2" id="explain_level"> <title>EXPLAIN_LEVEL Query Option</title> + <titlealts audience="PDF"><navtitle>EXPLAIN_LEVEL</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -11,6 +12,9 @@ <data name="Category" value="Querying"/> <data name="Category" value="Performance"/> <data name="Category" value="Reports"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> @@ -126,80 +130,84 @@ <codeblock>[localhost:21000] > create table t1 (x int, s string); [localhost:21000] > set explain_level=1; [localhost:21000] > explain select count(*) from t1; -+------------------------------------------------------------------------------------+ -| Explain String | -+------------------------------------------------------------------------------------+ -| Estimated Per-Host Requirements: Memory=10.00MB VCores=1 | -| WARNING: The following tables are missing relevant table and/or column statistics. 
| -| explain_plan.t1 | -| | -| 03:AGGREGATE [MERGE FINALIZE] | -| | output: sum(count(*)) | -| | | -| 02:EXCHANGE [PARTITION=UNPARTITIONED] | -| | | -| 01:AGGREGATE | -| | output: count(*) | -| | | -| 00:SCAN HDFS [explain_plan.t1] | -| partitions=1/1 size=0B | -+------------------------------------------------------------------------------------+ ++------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=10.00MB VCores=1 | +| WARNING: The following tables are missing relevant table and/or column | +| statistics. | +| explain_plan.t1 | +| | +| 03:AGGREGATE [MERGE FINALIZE] | +| | output: sum(count(*)) | +| | | +| 02:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +| 01:AGGREGATE | +| | output: count(*) | +| | | +| 00:SCAN HDFS [explain_plan.t1] | +| partitions=1/1 size=0B | ++------------------------------------------------------------------------+ [localhost:21000] > explain select * from t1; -+------------------------------------------------------------------------------------+ -| Explain String | -+------------------------------------------------------------------------------------+ -| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | -| WARNING: The following tables are missing relevant table and/or column statistics. | -| explain_plan.t1 | -| | -| 01:EXCHANGE [PARTITION=UNPARTITIONED] | -| | | -| 00:SCAN HDFS [explain_plan.t1] | -| partitions=1/1 size=0B | -+------------------------------------------------------------------------------------+ ++------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | +| WARNING: The following tables are missing relevant table and/or column | +| statistics. 
| +| explain_plan.t1 | +| | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +| 00:SCAN HDFS [explain_plan.t1] | +| partitions=1/1 size=0B | ++------------------------------------------------------------------------+ [localhost:21000] > set explain_level=2; [localhost:21000] > explain select * from t1; -+------------------------------------------------------------------------------------+ -| Explain String | -+------------------------------------------------------------------------------------+ -| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | -| WARNING: The following tables are missing relevant table and/or column statistics. | -| explain_plan.t1 | -| | -| 01:EXCHANGE [PARTITION=UNPARTITIONED] | -| | hosts=0 per-host-mem=unavailable | -| | tuple-ids=0 row-size=19B cardinality=unavailable | -| | | -| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] | -| partitions=1/1 size=0B | -| table stats: unavailable | -| column stats: unavailable | -| hosts=0 per-host-mem=0B | -| tuple-ids=0 row-size=19B cardinality=unavailable | -+------------------------------------------------------------------------------------+ ++------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | +| WARNING: The following tables are missing relevant table and/or column | +| statistics. 
| +| explain_plan.t1 | +| | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| | hosts=0 per-host-mem=unavailable | +| | tuple-ids=0 row-size=19B cardinality=unavailable | +| | | +| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] | +| partitions=1/1 size=0B | +| table stats: unavailable | +| column stats: unavailable | +| hosts=0 per-host-mem=0B | +| tuple-ids=0 row-size=19B cardinality=unavailable | ++------------------------------------------------------------------------+ [localhost:21000] > set explain_level=3; [localhost:21000] > explain select * from t1; -+------------------------------------------------------------------------------------+ -| Explain String | -+------------------------------------------------------------------------------------+ -| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | -<b>| WARNING: The following tables are missing relevant table and/or column statistics. |</b> -<b>| explain_plan.t1 |</b> -| | -| F01:PLAN FRAGMENT [PARTITION=UNPARTITIONED] | -| 01:EXCHANGE [PARTITION=UNPARTITIONED] | -| hosts=0 per-host-mem=unavailable | -| tuple-ids=0 row-size=19B cardinality=unavailable | -| | -| F00:PLAN FRAGMENT [PARTITION=RANDOM] | -| DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, PARTITION=UNPARTITIONED] | -| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] | -| partitions=1/1 size=0B | -<b>| table stats: unavailable |</b> -<b>| column stats: unavailable |</b> -| hosts=0 per-host-mem=0B | -| tuple-ids=0 row-size=19B cardinality=unavailable | -+------------------------------------------------------------------------------------+ ++------------------------------------------------------------------------+ +| Explain String | ++------------------------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=-9223372036854775808B VCores=0 | +<b>| WARNING: The following tables are missing relevant table and/or column |</b> +<b>| statistics. 
|</b> +<b>| explain_plan.t1 |</b> +| | +| F01:PLAN FRAGMENT [PARTITION=UNPARTITIONED] | +| 01:EXCHANGE [PARTITION=UNPARTITIONED] | +| hosts=0 per-host-mem=unavailable | +| tuple-ids=0 row-size=19B cardinality=unavailable | +| | +| F00:PLAN FRAGMENT [PARTITION=RANDOM] | +| DATASTREAM SINK [FRAGMENT=F01, EXCHANGE=01, PARTITION=UNPARTITIONED] | +| 00:SCAN HDFS [explain_plan.t1, PARTITION=RANDOM] | +| partitions=1/1 size=0B | +<b>| table stats: unavailable |</b> +<b>| column stats: unavailable |</b> +| hosts=0 per-host-mem=0B | +| tuple-ids=0 row-size=19B cardinality=unavailable | ++------------------------------------------------------------------------+ </codeblock> <p> @@ -246,61 +254,63 @@ <codeblock>[localhost:21000] > set explain_level=1; [localhost:21000] > explain select one.*, two.*, three.* from t1 one, t1 two, t1 three where one.x = two.x and two.x = three.x; -+------------------------------------------------------------------------------------+ -| Explain String | -+------------------------------------------------------------------------------------+ -| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 | -| | -| 07:EXCHANGE [PARTITION=UNPARTITIONED] | -| | | -<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b> -| | hash predicates: two.x = three.x | -| | | -<b>| |--06:EXCHANGE [BROADCAST] |</b> -| | | | -| | 02:SCAN HDFS [explain_plan.t1 three] | -| | partitions=1/1 size=0B | -| | | -<b>| 03:HASH JOIN [INNER JOIN, BROADCAST] |</b> -| | hash predicates: one.x = two.x | -| | | -<b>| |--05:EXCHANGE [BROADCAST] |</b> -| | | | -| | 01:SCAN HDFS [explain_plan.t1 two] | -| | partitions=1/1 size=0B | -| | | -| 00:SCAN HDFS [explain_plan.t1 one] | -| partitions=1/1 size=0B | -+------------------------------------------------------------------------------------+ -[localhost:21000] > explain select one.*, two.*, three.* from t1 one join [shuffle] t1 two join t1 three where one.x = two.x and two.x = three.x; 
-+------------------------------------------------------------------------------------+ -| Explain String | -+------------------------------------------------------------------------------------+ -| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 | -| | -| 08:EXCHANGE [PARTITION=UNPARTITIONED] | -| | | -<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b> -| | hash predicates: two.x = three.x | -| | | -<b>| |--07:EXCHANGE [BROADCAST] |</b> -| | | | -| | 02:SCAN HDFS [explain_plan.t1 three] | -| | partitions=1/1 size=0B | -| | | -<b>| 03:HASH JOIN [INNER JOIN, PARTITIONED] |</b> -| | hash predicates: one.x = two.x | -| | | -<b>| |--06:EXCHANGE [PARTITION=HASH(two.x)] |</b> -| | | | -| | 01:SCAN HDFS [explain_plan.t1 two] | -| | partitions=1/1 size=0B | -| | | -<b>| 05:EXCHANGE [PARTITION=HASH(one.x)] |</b> -| | | -| 00:SCAN HDFS [explain_plan.t1 one] | -| partitions=1/1 size=0B | -+------------------------------------------------------------------------------------+ ++---------------------------------------------------------+ +| Explain String | ++---------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 | +| | +| 07:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b> +| | hash predicates: two.x = three.x | +| | | +<b>| |--06:EXCHANGE [BROADCAST] |</b> +| | | | +| | 02:SCAN HDFS [explain_plan.t1 three] | +| | partitions=1/1 size=0B | +| | | +<b>| 03:HASH JOIN [INNER JOIN, BROADCAST] |</b> +| | hash predicates: one.x = two.x | +| | | +<b>| |--05:EXCHANGE [BROADCAST] |</b> +| | | | +| | 01:SCAN HDFS [explain_plan.t1 two] | +| | partitions=1/1 size=0B | +| | | +| 00:SCAN HDFS [explain_plan.t1 one] | +| partitions=1/1 size=0B | ++---------------------------------------------------------+ +[localhost:21000] > explain select one.*, two.*, three.* + > from t1 one join [shuffle] t1 two join t1 three + > where one.x = two.x and two.x = three.x; 
++---------------------------------------------------------+ +| Explain String | ++---------------------------------------------------------+ +| Estimated Per-Host Requirements: Memory=4.00GB VCores=3 | +| | +| 08:EXCHANGE [PARTITION=UNPARTITIONED] | +| | | +<b>| 04:HASH JOIN [INNER JOIN, BROADCAST] |</b> +| | hash predicates: two.x = three.x | +| | | +<b>| |--07:EXCHANGE [BROADCAST] |</b> +| | | | +| | 02:SCAN HDFS [explain_plan.t1 three] | +| | partitions=1/1 size=0B | +| | | +<b>| 03:HASH JOIN [INNER JOIN, PARTITIONED] |</b> +| | hash predicates: one.x = two.x | +| | | +<b>| |--06:EXCHANGE [PARTITION=HASH(two.x)] |</b> +| | | | +| | 01:SCAN HDFS [explain_plan.t1 two] | +| | partitions=1/1 size=0B | +| | | +<b>| 05:EXCHANGE [PARTITION=HASH(one.x)] |</b> +| | | +| 00:SCAN HDFS [explain_plan.t1 one] | +| partitions=1/1 size=0B | ++---------------------------------------------------------+ </codeblock> <p> @@ -314,7 +324,9 @@ </p> <codeblock>[localhost:21000] > set explain_level=0; -[localhost:21000] > explain select one.*, two.*, three.* from t1 one join [shuffle] t1 two join t1 three where one.x = two.x and two.x = three.x; +[localhost:21000] > explain select one.*, two.*, three.* + > from t1 one join [shuffle] t1 two join t1 three + > where one.x = two.x and two.x = three.x; +---------------------------------------------------------+ | Explain String | +---------------------------------------------------------+ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_explain_plan.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_explain_plan.xml b/docs/topics/impala_explain_plan.xml index 44c8b74..4fd721f 100644 --- a/docs/topics/impala_explain_plan.xml +++ b/docs/topics/impala_explain_plan.xml @@ -4,7 +4,19 @@ <title>Understanding Impala Query Performance - EXPLAIN Plans and Query Profiles</title> <titlealts audience="PDF"><navtitle>EXPLAIN Plans and Query 
Profiles</navtitle></titlealts> - + <prolog> + <metadata> + <data name="Category" value="Performance"/> + <data name="Category" value="Impala"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Reports"/> + <data name="Category" value="Concepts"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> <conbody> @@ -14,7 +26,543 @@ actually running the query itself. </p> - + <p rev="1.4.0"> + For an overview of the physical performance characteristics for a query, issue the <codeph>SUMMARY</codeph> + statement in <cmdname>impala-shell</cmdname> immediately after executing a query. This condensed information + shows which phases of execution took the most time, and how the estimates for memory usage and number of rows + at each phase compare to the actual values. + </p> + + <p> + To understand the detailed performance characteristics for a query, issue the <codeph>PROFILE</codeph> + statement in <cmdname>impala-shell</cmdname> immediately after executing a query. This low-level information + includes physical details about memory, CPU, I/O, and network usage, and thus is only available after the + query is actually run. + </p> + + <p outputclass="toc inpage"/> + + <p> + Also, see <xref href="impala_hbase.xml#hbase_performance"/> + and <xref href="impala_s3.xml#s3_performance"/> + for examples of interpreting + <codeph>EXPLAIN</codeph> plans for queries against HBase tables + <ph rev="2.2.0">and data stored in the Amazon Simple Storage Service (S3)</ph>. 
+ </p> + </conbody> + + <concept id="perf_explain"> + + <title>Using the EXPLAIN Plan for Performance Tuning</title> + + <conbody> + + <p> + The <codeph><xref href="impala_explain.xml#explain">EXPLAIN</xref></codeph> statement gives you an outline + of the logical steps that a query will perform, such as how the work will be distributed among the nodes + and how intermediate results will be combined to produce the final result set. You can see these details + before actually running the query. You can use this information to check that the query will not operate in + an unexpected or inefficient way. + </p> + +<!-- Turn into a conref in ciiu_langref too. Relocate to common.xml. --> + +<codeblock conref="impala_explain.xml#explain/explain_plan_simple"/> + + <p conref="../shared/impala_common.xml#common/explain_interpret"/> + + <p> + The <codeph>EXPLAIN</codeph> plan is also printed at the beginning of the query profile report described in + <xref href="#perf_profile"/>, for convenience in examining both the logical and physical aspects of the + query side-by-side. + </p> + + <p rev="1.2"> + The amount of detail displayed in the <codeph>EXPLAIN</codeph> output is controlled by the + <xref href="impala_explain_level.xml#explain_level">EXPLAIN_LEVEL</xref> query option. You typically + increase this setting from <codeph>normal</codeph> to <codeph>verbose</codeph> (or from <codeph>0</codeph> + to <codeph>1</codeph>) when double-checking the presence of table and column statistics during performance + tuning, or when estimating query resource usage in conjunction with the resource management features in CDH + 5. + </p> + + <!-- To do: + This is a good place to have a few examples. 
+ --> </conbody> </concept> + <concept id="perf_summary"> + + <title>Using the SUMMARY Report for Performance Tuning</title> + + <conbody> + + <p> + The <codeph><xref href="impala_shell_commands.xml#shell_commands">SUMMARY</xref></codeph> command within + the <cmdname>impala-shell</cmdname> interpreter gives you an easy-to-digest overview of the timings for the + different phases of execution for a query. As with the <codeph>EXPLAIN</codeph> plan, it makes + potential performance bottlenecks easy to see. Like the <codeph>PROFILE</codeph> output, it is available after the + query is run and so displays actual timing numbers. + </p> + + <p> + The <codeph>SUMMARY</codeph> report is also printed at the beginning of the query profile report described + in <xref href="#perf_profile"/>, for convenience in examining high-level and low-level aspects of the query + side-by-side. + </p> + + <p> + For example, here is a query involving an aggregate function, on a single-node VM. The different stages of + the query and their timings are shown (rolled up for all nodes), along with estimated and actual values + used in planning the query. In this case, the <codeph>AVG()</codeph> function is computed for a subset of + data on each node (stage 01) and then the aggregated results from all nodes are combined at the end (stage + 03). You can see which stages took the most time, and whether any estimates were substantially different + from the actual data distribution. (When examining the time values, be sure to consider the suffixes such + as <codeph>us</codeph> for microseconds and <codeph>ms</codeph> for milliseconds, rather than just looking + for the largest numbers.) 
+ </p> + +<codeblock>[localhost:21000] > select avg(ss_sales_price) from store_sales where ss_coupon_amt = 0; ++---------------------+ +| avg(ss_sales_price) | ++---------------------+ +| 37.80770926328327 | ++---------------------+ +[localhost:21000] > summary; ++--------------+--------+----------+----------+-------+------------+----------+---------------+-----------------+ +| Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail | ++--------------+--------+----------+----------+-------+------------+----------+---------------+-----------------+ +| 03:AGGREGATE | 1 | 1.03ms | 1.03ms | 1 | 1 | 48.00 KB | -1 B | MERGE FINALIZE | +| 02:EXCHANGE | 1 | 0ns | 0ns | 1 | 1 | 0 B | -1 B | UNPARTITIONED | +| 01:AGGREGATE | 1 | 30.79ms | 30.79ms | 1 | 1 | 80.00 KB | 10.00 MB | | +| 00:SCAN HDFS | 1 | 5.45s | 5.45s | 2.21M | -1 | 64.05 MB | 432.00 MB | tpc.store_sales | ++--------------+--------+----------+----------+-------+------------+----------+---------------+-----------------+ +</codeblock> + + <p> + Notice how the longest initial phase of the query is measured in seconds (s), while later phases working on + smaller intermediate results are measured in milliseconds (ms) or even nanoseconds (ns). + </p> + + <p> + Here is an example from a more complicated query, as it would appear in the <codeph>PROFILE</codeph> + output: + </p> + +<!-- This example taken from: https://github.com/cloudera/Impala/commit/af85d3b518089b8840ddea4356947e40d1aca9bd --> + +<codeblock>Operator #Hosts Avg Time Max Time #Rows Est. #Rows Peak Mem Est. 
Peak Mem Detail +------------------------------------------------------------------------------------------------------------------------ +09:MERGING-EXCHANGE 1 79.738us 79.738us 5 5 0 -1.00 B UNPARTITIONED +05:TOP-N 3 84.693us 88.810us 5 5 12.00 KB 120.00 B +04:AGGREGATE 3 5.263ms 6.432ms 5 5 44.00 KB 10.00 MB MERGE FINALIZE +08:AGGREGATE 3 16.659ms 27.444ms 52.52K 600.12K 3.20 MB 15.11 MB MERGE +07:EXCHANGE 3 2.644ms 5.1ms 52.52K 600.12K 0 0 HASH(o_orderpriority) +03:AGGREGATE 3 342.913ms 966.291ms 52.52K 600.12K 10.80 MB 15.11 MB +02:HASH JOIN 3 2s165ms 2s171ms 144.87K 600.12K 13.63 MB 941.01 KB INNER JOIN, BROADCAST +|--06:EXCHANGE 3 8.296ms 8.692ms 57.22K 15.00K 0 0 BROADCAST +| 01:SCAN HDFS 2 1s412ms 1s978ms 57.22K 15.00K 24.21 MB 176.00 MB tpch.orders o +00:SCAN HDFS 3 8s032ms 8s558ms 3.79M 600.12K 32.29 MB 264.00 MB tpch.lineitem l +</codeblock> + </conbody> + </concept> + + <concept id="perf_profile"> + + <title>Using the Query Profile for Performance Tuning</title> + + <conbody> + + <p> + The <codeph>PROFILE</codeph> statement, available in the <cmdname>impala-shell</cmdname> interpreter, + produces a detailed low-level report showing how the most recent query was executed. Unlike the + <codeph>EXPLAIN</codeph> plan described in <xref href="#perf_explain"/>, this information is only available + after the query has finished. It shows physical details such as the number of bytes read, maximum memory + usage, and so on for each node. You can use this information to determine if the query is I/O-bound or + CPU-bound, whether some network condition is imposing a bottleneck, whether a slowdown is affecting some + nodes but not others, and to check that recommended configuration settings such as short-circuit local + reads are in effect. + </p> + + <p rev="CDH-29157"> + By default, time values in the profile output reflect the wall-clock time taken by an operation. 
+ For values denoting system time or user time, the kind of time being measured is reflected in the metric + name, such as <codeph>ScannerThreadsSysTime</codeph> or <codeph>ScannerThreadsUserTime</codeph>. + For example, a multi-threaded I/O operation might show a small figure for wall-clock time, + while the corresponding system time is larger, representing the sum of the CPU time taken by each thread. + Or a wall-clock time figure might be larger because it counts time spent waiting, while + the corresponding system and user time figures only measure the time while the operation + is actively using CPU cycles. + </p> + + <p> + The <xref href="impala_explain_plan.xml#perf_explain"><codeph>EXPLAIN</codeph> plan</xref> is also printed + at the beginning of the query profile report, for convenience in examining both the logical and physical + aspects of the query side-by-side. The + <xref href="impala_explain_level.xml#explain_level">EXPLAIN_LEVEL</xref> query option also controls the + verbosity of the <codeph>EXPLAIN</codeph> output printed by the <codeph>PROFILE</codeph> command. + </p> + + <!-- To do: + This is a good place to have a few more examples. + --> + + <p> + Here is an example of a query profile, from a straightforward query on a single-node + pseudo-distributed cluster, chosen to keep the output relatively brief. 
+ </p> + +<codeblock>[localhost:21000] > profile; +Query Runtime Profile: +Query (id=6540a03d4bee0691:4963d6269b210ebd): + Summary: + Session ID: ea4a197f1c7bf858:c74e66f72e3a33ba + Session Type: BEESWAX + Start Time: 2013-12-02 17:10:30.263067000 + End Time: 2013-12-02 17:10:50.932044000 + Query Type: QUERY + Query State: FINISHED + Query Status: OK + Impala Version: impalad version 1.2.1 RELEASE (build edb5af1bcad63d410bc5d47cc203df3a880e9324) + User: cloudera + Network Address: 127.0.0.1:49161 + Default Db: stats_testing + Sql Statement: select t1.s, t2.s from t1 join t2 on (t1.id = t2.parent) + Plan: +---------------- +Estimated Per-Host Requirements: Memory=2.09GB VCores=2 + +PLAN FRAGMENT 0 + PARTITION: UNPARTITIONED + + 4:EXCHANGE + cardinality: unavailable + per-host memory: unavailable + tuple ids: 0 1 + +PLAN FRAGMENT 1 + PARTITION: RANDOM + + STREAM DATA SINK + EXCHANGE ID: 4 + UNPARTITIONED + + 2:HASH JOIN + | join op: INNER JOIN (BROADCAST) + | hash predicates: + | t1.id = t2.parent + | cardinality: unavailable + | per-host memory: 2.00GB + | tuple ids: 0 1 + | + |----3:EXCHANGE + | cardinality: unavailable + | per-host memory: 0B + | tuple ids: 1 + | + 0:SCAN HDFS + table=stats_testing.t1 #partitions=1/1 size=33B + table stats: unavailable + column stats: unavailable + cardinality: unavailable + per-host memory: 32.00MB + tuple ids: 0 + +PLAN FRAGMENT 2 + PARTITION: RANDOM + + STREAM DATA SINK + EXCHANGE ID: 3 + UNPARTITIONED + + 1:SCAN HDFS + table=stats_testing.t2 #partitions=1/1 size=960.00KB + table stats: unavailable + column stats: unavailable + cardinality: unavailable + per-host memory: 96.00MB + tuple ids: 1 +---------------- + Query Timeline: 20s670ms + - Start execution: 2.559ms (2.559ms) + - Planning finished: 23.587ms (21.27ms) + - Rows available: 666.199ms (642.612ms) + - First row fetched: 668.919ms (2.719ms) + - Unregister query: 20s668ms (20s000ms) + ImpalaServer: + - ClientFetchWaitTimer: 19s637ms + - RowMaterializationTimer: 
167.121ms + Execution Profile 6540a03d4bee0691:4963d6269b210ebd:(Active: 837.815ms, % non-child: 0.00%) + Per Node Peak Memory Usage: impala-1.example.com:22000(7.42 MB) + - FinalizationTimer: 0ns + Coordinator Fragment:(Active: 195.198ms, % non-child: 0.00%) + MemoryUsage(500.0ms): 16.00 KB, 7.42 MB, 7.33 MB, 7.10 MB, 6.94 MB, 6.71 MB, 6.56 MB, 6.40 MB, 6.17 MB, 6.02 MB, 5.79 MB, 5.63 MB, 5.48 MB, 5.25 MB, 5.09 MB, 4.86 MB, 4.71 MB, 4.47 MB, 4.32 MB, 4.09 MB, 3.93 MB, 3.78 MB, 3.55 MB, 3.39 MB, 3.16 MB, 3.01 MB, 2.78 MB, 2.62 MB, 2.39 MB, 2.24 MB, 2.08 MB, 1.85 MB, 1.70 MB, 1.54 MB, 1.31 MB, 1.16 MB, 948.00 KB, 790.00 KB, 553.00 KB, 395.00 KB, 237.00 KB + ThreadUsage(500.0ms): 1 + - AverageThreadTokens: 1.00 + - PeakMemoryUsage: 7.42 MB + - PrepareTime: 36.144us + - RowsProduced: 98.30K (98304) + - TotalCpuTime: 20s449ms + - TotalNetworkWaitTime: 191.630ms + - TotalStorageWaitTime: 0ns + CodeGen:(Active: 150.679ms, % non-child: 77.19%) + - CodegenTime: 0ns + - CompileTime: 139.503ms + - LoadTime: 10.7ms + - ModuleFileSize: 95.27 KB + EXCHANGE_NODE (id=4):(Active: 194.858ms, % non-child: 99.83%) + - BytesReceived: 2.33 MB + - ConvertRowBatchTime: 2.732ms + - DataArrivalWaitTime: 191.118ms + - DeserializeRowBatchTimer: 14.943ms + - FirstBatchArrivalWaitTime: 191.117ms + - PeakMemoryUsage: 7.41 MB + - RowsReturned: 98.30K (98304) + - RowsReturnedRate: 504.49 K/sec + - SendersBlockedTimer: 0ns + - SendersBlockedTotalTimer(*): 0ns + Averaged Fragment 1:(Active: 442.360ms, % non-child: 0.00%) + split sizes: min: 33.00 B, max: 33.00 B, avg: 33.00 B, stddev: 0.00 + completion times: min:443.720ms max:443.720ms mean: 443.720ms stddev:0ns + execution rates: min:74.00 B/sec max:74.00 B/sec mean:74.00 B/sec stddev:0.00 /sec + num instances: 1 + - AverageThreadTokens: 1.00 + - PeakMemoryUsage: 6.06 MB + - PrepareTime: 7.291ms + - RowsProduced: 98.30K (98304) + - TotalCpuTime: 784.259ms + - TotalNetworkWaitTime: 388.818ms + - TotalStorageWaitTime: 3.934ms + CodeGen:(Active: 
312.862ms, % non-child: 70.73%) + - CodegenTime: 2.669ms + - CompileTime: 302.467ms + - LoadTime: 9.231ms + - ModuleFileSize: 95.27 KB + DataStreamSender (dst_id=4):(Active: 80.63ms, % non-child: 18.10%) + - BytesSent: 2.33 MB + - NetworkThroughput(*): 35.89 MB/sec + - OverallThroughput: 29.06 MB/sec + - PeakMemoryUsage: 5.33 KB + - SerializeBatchTime: 26.487ms + - ThriftTransmitTime(*): 64.814ms + - UncompressedRowBatchSize: 6.66 MB + HASH_JOIN_NODE (id=2):(Active: 362.25ms, % non-child: 3.92%) + - BuildBuckets: 1.02K (1024) + - BuildRows: 98.30K (98304) + - BuildTime: 12.622ms + - LoadFactor: 0.00 + - PeakMemoryUsage: 6.02 MB + - ProbeRows: 3 + - ProbeTime: 3.579ms + - RowsReturned: 98.30K (98304) + - RowsReturnedRate: 271.54 K/sec + EXCHANGE_NODE (id=3):(Active: 344.680ms, % non-child: 77.92%) + - BytesReceived: 1.15 MB + - ConvertRowBatchTime: 2.792ms + - DataArrivalWaitTime: 339.936ms + - DeserializeRowBatchTimer: 9.910ms + - FirstBatchArrivalWaitTime: 199.474ms + - PeakMemoryUsage: 156.00 KB + - RowsReturned: 98.30K (98304) + - RowsReturnedRate: 285.20 K/sec + - SendersBlockedTimer: 0ns + - SendersBlockedTotalTimer(*): 0ns + HDFS_SCAN_NODE (id=0):(Active: 13.616us, % non-child: 0.00%) + - AverageHdfsReadThreadConcurrency: 0.00 + - AverageScannerThreadConcurrency: 0.00 + - BytesRead: 33.00 B + - BytesReadLocal: 33.00 B + - BytesReadShortCircuit: 33.00 B + - NumDisksAccessed: 1 + - NumScannerThreadsStarted: 1 + - PeakMemoryUsage: 46.00 KB + - PerReadThreadRawHdfsThroughput: 287.52 KB/sec + - RowsRead: 3 + - RowsReturned: 3 + - RowsReturnedRate: 220.33 K/sec + - ScanRangesComplete: 1 + - ScannerThreadsInvoluntaryContextSwitches: 26 + - ScannerThreadsTotalWallClockTime: 55.199ms + - DelimiterParseTime: 2.463us + - MaterializeTupleTime(*): 1.226us + - ScannerThreadsSysTime: 0ns + - ScannerThreadsUserTime: 42.993ms + - ScannerThreadsVoluntaryContextSwitches: 1 + - TotalRawHdfsReadTime(*): 112.86us + - TotalReadThroughput: 0.00 /sec + Averaged Fragment 2:(Active: 
190.120ms, % non-child: 0.00%) + split sizes: min: 960.00 KB, max: 960.00 KB, avg: 960.00 KB, stddev: 0.00 + completion times: min:191.736ms max:191.736ms mean: 191.736ms stddev:0ns + execution rates: min:4.89 MB/sec max:4.89 MB/sec mean:4.89 MB/sec stddev:0.00 /sec + num instances: 1 + - AverageThreadTokens: 0.00 + - PeakMemoryUsage: 906.33 KB + - PrepareTime: 3.67ms + - RowsProduced: 98.30K (98304) + - TotalCpuTime: 403.351ms + - TotalNetworkWaitTime: 34.999ms + - TotalStorageWaitTime: 108.675ms + CodeGen:(Active: 162.57ms, % non-child: 85.24%) + - CodegenTime: 3.133ms + - CompileTime: 148.316ms + - LoadTime: 12.317ms + - ModuleFileSize: 95.27 KB + DataStreamSender (dst_id=3):(Active: 70.620ms, % non-child: 37.14%) + - BytesSent: 1.15 MB + - NetworkThroughput(*): 23.30 MB/sec + - OverallThroughput: 16.23 MB/sec + - PeakMemoryUsage: 5.33 KB + - SerializeBatchTime: 22.69ms + - ThriftTransmitTime(*): 49.178ms + - UncompressedRowBatchSize: 3.28 MB + HDFS_SCAN_NODE (id=1):(Active: 118.839ms, % non-child: 62.51%) + - AverageHdfsReadThreadConcurrency: 0.00 + - AverageScannerThreadConcurrency: 0.00 + - BytesRead: 960.00 KB + - BytesReadLocal: 960.00 KB + - BytesReadShortCircuit: 960.00 KB + - NumDisksAccessed: 1 + - NumScannerThreadsStarted: 1 + - PeakMemoryUsage: 869.00 KB + - PerReadThreadRawHdfsThroughput: 130.21 MB/sec + - RowsRead: 98.30K (98304) + - RowsReturned: 98.30K (98304) + - RowsReturnedRate: 827.20 K/sec + - ScanRangesComplete: 15 + - ScannerThreadsInvoluntaryContextSwitches: 34 + - ScannerThreadsTotalWallClockTime: 189.774ms + - DelimiterParseTime: 15.703ms + - MaterializeTupleTime(*): 3.419ms + - ScannerThreadsSysTime: 1.999ms + - ScannerThreadsUserTime: 44.993ms + - ScannerThreadsVoluntaryContextSwitches: 118 + - TotalRawHdfsReadTime(*): 7.199ms + - TotalReadThroughput: 0.00 /sec + Fragment 1: + Instance 6540a03d4bee0691:4963d6269b210ebf (host=impala-1.example.com:22000):(Active: 442.360ms, % non-child: 0.00%) + Hdfs split stats (<volume id>:<# 
splits>/<split lengths>): 0:1/33.00 B + MemoryUsage(500.0ms): 69.33 KB + ThreadUsage(500.0ms): 1 + - AverageThreadTokens: 1.00 + - PeakMemoryUsage: 6.06 MB + - PrepareTime: 7.291ms + - RowsProduced: 98.30K (98304) + - TotalCpuTime: 784.259ms + - TotalNetworkWaitTime: 388.818ms + - TotalStorageWaitTime: 3.934ms + CodeGen:(Active: 312.862ms, % non-child: 70.73%) + - CodegenTime: 2.669ms + - CompileTime: 302.467ms + - LoadTime: 9.231ms + - ModuleFileSize: 95.27 KB + DataStreamSender (dst_id=4):(Active: 80.63ms, % non-child: 18.10%) + - BytesSent: 2.33 MB + - NetworkThroughput(*): 35.89 MB/sec + - OverallThroughput: 29.06 MB/sec + - PeakMemoryUsage: 5.33 KB + - SerializeBatchTime: 26.487ms + - ThriftTransmitTime(*): 64.814ms + - UncompressedRowBatchSize: 6.66 MB + HASH_JOIN_NODE (id=2):(Active: 362.25ms, % non-child: 3.92%) + ExecOption: Build Side Codegen Enabled, Probe Side Codegen Enabled, Hash Table Built Asynchronously + - BuildBuckets: 1.02K (1024) + - BuildRows: 98.30K (98304) + - BuildTime: 12.622ms + - LoadFactor: 0.00 + - PeakMemoryUsage: 6.02 MB + - ProbeRows: 3 + - ProbeTime: 3.579ms + - RowsReturned: 98.30K (98304) + - RowsReturnedRate: 271.54 K/sec + EXCHANGE_NODE (id=3):(Active: 344.680ms, % non-child: 77.92%) + - BytesReceived: 1.15 MB + - ConvertRowBatchTime: 2.792ms + - DataArrivalWaitTime: 339.936ms + - DeserializeRowBatchTimer: 9.910ms + - FirstBatchArrivalWaitTime: 199.474ms + - PeakMemoryUsage: 156.00 KB + - RowsReturned: 98.30K (98304) + - RowsReturnedRate: 285.20 K/sec + - SendersBlockedTimer: 0ns + - SendersBlockedTotalTimer(*): 0ns + HDFS_SCAN_NODE (id=0):(Active: 13.616us, % non-child: 0.00%) + Hdfs split stats (<volume id>:<# splits>/<split lengths>): 0:1/33.00 B + Hdfs Read Thread Concurrency Bucket: 0:0% 1:0% + File Formats: TEXT/NONE:1 + ExecOption: Codegen enabled: 1 out of 1 + - AverageHdfsReadThreadConcurrency: 0.00 + - AverageScannerThreadConcurrency: 0.00 + - BytesRead: 33.00 B + - BytesReadLocal: 33.00 B + - BytesReadShortCircuit: 
33.00 B + - NumDisksAccessed: 1 + - NumScannerThreadsStarted: 1 + - PeakMemoryUsage: 46.00 KB + - PerReadThreadRawHdfsThroughput: 287.52 KB/sec + - RowsRead: 3 + - RowsReturned: 3 + - RowsReturnedRate: 220.33 K/sec + - ScanRangesComplete: 1 + - ScannerThreadsInvoluntaryContextSwitches: 26 + - ScannerThreadsTotalWallClockTime: 55.199ms + - DelimiterParseTime: 2.463us + - MaterializeTupleTime(*): 1.226us + - ScannerThreadsSysTime: 0ns + - ScannerThreadsUserTime: 42.993ms + - ScannerThreadsVoluntaryContextSwitches: 1 + - TotalRawHdfsReadTime(*): 112.86us + - TotalReadThroughput: 0.00 /sec + Fragment 2: + Instance 6540a03d4bee0691:4963d6269b210ec0 (host=impala-1.example.com:22000):(Active: 190.120ms, % non-child: 0.00%) + Hdfs split stats (<volume id>:<# splits>/<split lengths>): 0:15/960.00 KB + - AverageThreadTokens: 0.00 + - PeakMemoryUsage: 906.33 KB + - PrepareTime: 3.67ms + - RowsProduced: 98.30K (98304) + - TotalCpuTime: 403.351ms + - TotalNetworkWaitTime: 34.999ms + - TotalStorageWaitTime: 108.675ms + CodeGen:(Active: 162.57ms, % non-child: 85.24%) + - CodegenTime: 3.133ms + - CompileTime: 148.316ms + - LoadTime: 12.317ms + - ModuleFileSize: 95.27 KB + DataStreamSender (dst_id=3):(Active: 70.620ms, % non-child: 37.14%) + - BytesSent: 1.15 MB + - NetworkThroughput(*): 23.30 MB/sec + - OverallThroughput: 16.23 MB/sec + - PeakMemoryUsage: 5.33 KB + - SerializeBatchTime: 22.69ms + - ThriftTransmitTime(*): 49.178ms + - UncompressedRowBatchSize: 3.28 MB + HDFS_SCAN_NODE (id=1):(Active: 118.839ms, % non-child: 62.51%) + Hdfs split stats (<volume id>:<# splits>/<split lengths>): 0:15/960.00 KB + Hdfs Read Thread Concurrency Bucket: 0:0% 1:0% + File Formats: TEXT/NONE:15 + ExecOption: Codegen enabled: 15 out of 15 + - AverageHdfsReadThreadConcurrency: 0.00 + - AverageScannerThreadConcurrency: 0.00 + - BytesRead: 960.00 KB + - BytesReadLocal: 960.00 KB + - BytesReadShortCircuit: 960.00 KB + - NumDisksAccessed: 1 + - NumScannerThreadsStarted: 1 + - PeakMemoryUsage: 869.00 
KB + - PerReadThreadRawHdfsThroughput: 130.21 MB/sec + - RowsRead: 98.30K (98304) + - RowsReturned: 98.30K (98304) + - RowsReturnedRate: 827.20 K/sec + - ScanRangesComplete: 15 + - ScannerThreadsInvoluntaryContextSwitches: 34 + - ScannerThreadsTotalWallClockTime: 189.774ms + - DelimiterParseTime: 15.703ms + - MaterializeTupleTime(*): 3.419ms + - ScannerThreadsSysTime: 1.999ms + - ScannerThreadsUserTime: 44.993ms + - ScannerThreadsVoluntaryContextSwitches: 118 + - TotalRawHdfsReadTime(*): 7.199ms + - TotalReadThroughput: 0.00 /sec</codeblock> + </conbody> + </concept> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_float.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_float.xml b/docs/topics/impala_float.xml index 51e3311..8ef1144 100644 --- a/docs/topics/impala_float.xml +++ b/docs/topics/impala_float.xml @@ -3,7 +3,7 @@ <concept id="float"> <title>FLOAT Data Type</title> - <titlealts><navtitle>FLOAT</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>FLOAT</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -68,11 +68,11 @@ SELECT CAST(1000.5 AS FLOAT); <p conref="../shared/impala_common.xml#common/text_bulky"/> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/compatibility_blurb"/> --> +<!-- <p conref="../shared/impala_common.xml#common/compatibility_blurb"/> --> <p conref="../shared/impala_common.xml#common/internals_4_bytes"/> -<!-- <p conref="/Content/impala_common_xi44078.xml#common/added_in_20"/> --> +<!-- <p conref="../shared/impala_common.xml#common/added_in_20"/> --> <p conref="../shared/impala_common.xml#common/column_stats_constant"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_functions.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_functions.xml b/docs/topics/impala_functions.xml index 
527744b..55a36dc 100644 --- a/docs/topics/impala_functions.xml +++ b/docs/topics/impala_functions.xml @@ -3,7 +3,7 @@ <concept id="builtins"> <title id="title_functions">Impala Built-In Functions</title> - <titlealts><navtitle>Built-In Functions</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>Built-In Functions</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -17,9 +17,9 @@ <conbody> - <draft-comment translate="no"> -Opportunity to conref some material between here and the "Functions" topic under "Schema Objects". -</draft-comment> + <!-- To do: + Opportunity to conref some material between here and the "Functions" topic under "Schema Objects". + --> <p> Impala supports several categories of built-in functions. These functions let you perform mathematical @@ -152,7 +152,7 @@ select max(height), avg(height) from census_data where age > 20; <p rev="2.0.0"> Analytic functions are a variation on aggregate functions. Instead of returning a single value, or an identical value for each group of rows, they can compute values that vary based on a <q>window</q> consisting - of of other rows around them in the result set. + of other rows around them in the result set. 
</p> <p outputclass="toc"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_functions_overview.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_functions_overview.xml b/docs/topics/impala_functions_overview.xml index 26a4d35..0e3973b 100644 --- a/docs/topics/impala_functions_overview.xml +++ b/docs/topics/impala_functions_overview.xml @@ -2,7 +2,7 @@ <concept id="functions"> <title>Overview of Impala Functions</title> - <titlealts><navtitle>Functions</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>Functions</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_grant.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_grant.xml b/docs/topics/impala_grant.xml index ddbd39c..ca45c0a 100644 --- a/docs/topics/impala_grant.xml +++ b/docs/topics/impala_grant.xml @@ -3,21 +3,25 @@ <concept rev="2.0.0" id="grant"> <title>GRANT Statement (CDH 5.2 or higher only)</title> - <titlealts><navtitle>GRANT (CDH 5.2 or higher only)</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>GRANT</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> <data name="Category" value="DDL"/> <data name="Category" value="SQL"/> + <data name="Category" value="Security"/> <data name="Category" value="Sentry"/> <data name="Category" value="Roles"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> <!-- Consider whether to go deeper into categories like Security for the Sentry-related statements. --> </metadata> </prolog> <conbody> - <p> + <p rev="2.0.0"> <indexterm audience="Cloudera">GRANT statement</indexterm> <!-- Copied from Sentry docs. Turn into conref. I did some rewording for clarity. 
--> The <codeph>GRANT</codeph> statement grants roles or privileges on specified objects to groups. Only Sentry @@ -69,8 +73,8 @@ object_type ::= TABLE | DATABASE | SERVER | URI <p rev="2.3.0 collevelauth"> The ability to grant or revoke <codeph>SELECT</codeph> privilege on specific columns is available - in CDH 5.5 / Impala 2.3 and higher. <!--See <xref href="sg_hive_sql.xml#concept_c2q_4qx_p4/col_level_auth_sentry"/> - for details.--> + in CDH 5.5 / Impala 2.3 and higher. See <xref href="sg_hive_sql.xml#concept_c2q_4qx_p4/col_level_auth_sentry"/> + for details. </p> <!-- Turn compatibility info into a conref or series of conrefs. (In both GRANT and REVOKE.) --> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_group_by.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_group_by.xml b/docs/topics/impala_group_by.xml index 10b7de4..055ae2a 100644 --- a/docs/topics/impala_group_by.xml +++ b/docs/topics/impala_group_by.xml @@ -9,6 +9,8 @@ <data name="Category" value="SQL"/> <data name="Category" value="Querying"/> <data name="Category" value="Aggregate Functions"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_group_concat.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_group_concat.xml b/docs/topics/impala_group_concat.xml index b2a7ff6..0971875 100644 --- a/docs/topics/impala_group_concat.xml +++ b/docs/topics/impala_group_concat.xml @@ -3,7 +3,7 @@ <concept rev="1.2" id="group_concat"> <title>GROUP_CONCAT Function</title> - <titlealts><navtitle>GROUP_CONCAT</navtitle></titlealts> + <titlealts audience="PDF"><navtitle>GROUP_CONCAT</navtitle></titlealts> <prolog> <metadata> <data name="Category" value="Impala"/> @@ -11,6 +11,8 @@ <data name="Category" 
value="Impala Functions"/> <data name="Category" value="Aggregate Functions"/> <data name="Category" value="Querying"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_hadoop.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_hadoop.xml b/docs/topics/impala_hadoop.xml index a3700c6..7941a47 100644 --- a/docs/topics/impala_hadoop.xml +++ b/docs/topics/impala_hadoop.xml @@ -4,7 +4,16 @@ <title>How Impala Fits Into the Hadoop Ecosystem</title> <titlealts audience="PDF"><navtitle>Role in the Hadoop Ecosystem</navtitle></titlealts> - + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Concepts"/> + <data name="Category" value="Hadoop"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> <conbody> @@ -14,7 +23,143 @@ ELT pipelines. </p> - + <p outputclass="toc inpage"/> + </conbody> + + <concept id="intro_hive"> + + <title>How Impala Works with Hive</title> + + <conbody> + + <p> + A major Impala goal is to make SQL-on-Hadoop operations fast and efficient enough to appeal to new + categories of users and open up Hadoop to new types of use cases. Where practical, it makes use of existing + Apache Hive infrastructure that many Hadoop users already have in place to perform long-running, + batch-oriented SQL queries. + </p> + + <p> + In particular, Impala keeps its table definitions in a traditional MySQL or PostgreSQL database known as + the <b>metastore</b>, the same database where Hive keeps this type of data. Thus, Impala can access tables + defined or loaded by Hive, as long as all columns use Impala-supported data types, file formats, and + compression codecs. 
+ </p> + + <p> + The initial focus on query features and performance means that Impala can read more types of data with the + <codeph>SELECT</codeph> statement than it can write with the <codeph>INSERT</codeph> statement. To query + data using the Avro, RCFile, or SequenceFile <xref href="impala_file_formats.xml#file_formats">file + formats</xref>, you load the data using Hive. + </p> + + <p rev="1.2.2"> + The Impala query optimizer can also make use of <xref href="impala_perf_stats.xml#perf_table_stats">table + statistics</xref> and <xref href="impala_perf_stats.xml#perf_column_stats">column statistics</xref>. + Originally, you gathered this information with the <codeph>ANALYZE TABLE</codeph> statement in Hive; in + Impala 1.2.2 and higher, use the Impala <codeph><xref href="impala_compute_stats.xml#compute_stats">COMPUTE + STATS</xref></codeph> statement instead. <codeph>COMPUTE STATS</codeph> requires less setup, is more + reliable, and does not require switching back and forth between <cmdname>impala-shell</cmdname> + and the Hive shell. + </p> + </conbody> + </concept> + + <concept id="intro_metastore"> + + <title>Overview of Impala Metadata and the Metastore</title> + <prolog> + <metadata> + <data name="Category" value="Concepts"/> + <data name="Category" value="Impala"/> + <data name="Category" value="Hive"/> + </metadata> + </prolog> + + <conbody> + + <p> + As discussed in <xref href="impala_hadoop.xml#intro_hive"/>, Impala maintains information about table + definitions in a central database known as the <b>metastore</b>. Impala also tracks other metadata for the + low-level characteristics of data files: + </p> + + <ul> + <li> + The physical locations of blocks within HDFS. + </li> + </ul> + + <p> + For tables with a large volume of data and/or many partitions, retrieving all the metadata for a table can + be time-consuming, taking minutes in some cases. 
Thus, each Impala node caches all of this metadata to + reuse for future queries against the same table. + </p> + + <p rev="1.2"> + If the table definition or the data in the table is updated, all other Impala daemons in the cluster must + receive the latest metadata, replacing the obsolete cached metadata, before issuing a query against that + table. In Impala 1.2 and higher, the metadata update is automatic, coordinated through the + <cmdname>catalogd</cmdname> daemon, for all DDL and DML statements issued through Impala. See + <xref href="impala_components.xml#intro_catalogd"/> for details. + </p> + + <p> + For DDL and DML issued through Hive, or changes made manually to files in HDFS, you still use the + <codeph>REFRESH</codeph> statement (when new data files are added to existing tables) or the + <codeph>INVALIDATE METADATA</codeph> statement (for entirely new tables, or after dropping a table, + performing an HDFS rebalance operation, or deleting data files). Issuing <codeph>INVALIDATE + METADATA</codeph> by itself retrieves metadata for all the tables tracked by the metastore. If you know + that only specific tables have been changed outside of Impala, you can issue <codeph>REFRESH + <varname>table_name</varname></codeph> for each affected table to retrieve the latest metadata for + only those tables. + </p> + </conbody> + </concept> + + <concept id="intro_hdfs"> + + <title>How Impala Uses HDFS</title> + <prolog> + <metadata> + <data name="Category" value="Concepts"/> + <data name="Category" value="Impala"/> + <data name="Category" value="HDFS"/> + </metadata> + </prolog> + + <conbody> + + <p> + Impala uses the distributed filesystem HDFS as its primary data storage medium. Impala relies on the + redundancy provided by HDFS to guard against hardware or network outages on individual nodes. Impala table + data is physically represented as data files in HDFS, using familiar HDFS file formats and compression + codecs.
When data files are present in the directory for a new table, Impala reads them all, regardless of + file name. New data is added in files with names controlled by Impala. + </p> </conbody> </concept> + <concept id="intro_hbase"> + + <title>How Impala Uses HBase</title> + <prolog> + <metadata> + <data name="Category" value="Concepts"/> + <data name="Category" value="Impala"/> + <data name="Category" value="HBase"/> + </metadata> + </prolog> + + <conbody> + + <p> + HBase is an alternative to HDFS as a storage medium for Impala data. It is a database storage system built + on top of HDFS, without built-in SQL support. Many Hadoop users already have it configured and store large + (often sparse) data sets in it. By defining tables in Impala and mapping them to equivalent tables in + HBase, you can query the contents of the HBase tables through Impala, and even perform join queries + including both Impala and HBase tables. See <xref href="impala_hbase.xml#impala_hbase"/> for details. + </p> + </conbody> + </concept> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3c2c8f12/docs/topics/impala_having.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_having.xml b/docs/topics/impala_having.xml index 064a4a8..2de32bb 100644 --- a/docs/topics/impala_having.xml +++ b/docs/topics/impala_having.xml @@ -9,6 +9,8 @@ <data name="Category" value="SQL"/> <data name="Category" value="Querying"/> <data name="Category" value="Aggregate Functions"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> </metadata> </prolog>
