http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_avro.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_avro.xml b/docs/topics/impala_avro.xml new file mode 100644 index 0000000..04e84aa --- /dev/null +++ b/docs/topics/impala_avro.xml @@ -0,0 +1,556 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="avro"> + + <title>Using the Avro File Format with Impala Tables</title> + <titlealts audience="PDF"><navtitle>Avro Data Files</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="File Formats"/> + <data name="Category" value="Avro"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p rev="1.4.0"> + <indexterm audience="Cloudera">Avro support in Impala</indexterm> + Impala supports using tables whose data files use the Avro file format. Impala can query Avro + tables, and in Impala 1.4.0 and higher can create them, but currently cannot insert data into them. For + insert operations, use Hive, then switch back to Impala to run queries. + </p> + + <table> + <title>Avro Format Support in Impala</title> + <tgroup cols="5"> + <colspec colname="1" colwidth="10*"/> + <colspec colname="2" colwidth="10*"/> + <colspec colname="3" colwidth="20*"/> + <colspec colname="4" colwidth="30*"/> + <colspec colname="5" colwidth="30*"/> + <thead> + <row> + <entry> + File Type + </entry> + <entry> + Format + </entry> + <entry> + Compression Codecs + </entry> + <entry> + Impala Can CREATE? + </entry> + <entry> + Impala Can INSERT? + </entry> + </row> + </thead> + <tbody> + <row conref="impala_file_formats.xml#file_formats/avro_support"> + <entry/> + </row> + </tbody> + </tgroup> + </table> + + <p outputclass="toc inpage"/> + </conbody> + + <concept id="avro_create_table"> + + <title>Creating Avro Tables</title> + + <conbody> + + <p> + To create a new table using the Avro file format, issue the <codeph>CREATE TABLE</codeph> statement through + Impala with the <codeph>STORED AS AVRO</codeph> clause, or through Hive. If you create the table through + Impala, you must include column definitions that match the fields specified in the Avro schema. With Hive, + you can omit the columns and just specify the Avro schema. + </p> + + <p rev="2.3.0"> + In <keyword keyref="impala23_full"/> and higher, the <codeph>CREATE TABLE</codeph> for Avro tables can include + SQL-style column definitions rather than specifying Avro notation through the <codeph>TBLPROPERTIES</codeph> + clause. Impala issues warning messages if there are any mismatches between the types specified in the + SQL column definitions and the underlying types; for example, any <codeph>TINYINT</codeph> or + <codeph>SMALLINT</codeph> columns are treated as <codeph>INT</codeph> in the underlying Avro files, + and therefore are displayed as <codeph>INT</codeph> in any <codeph>DESCRIBE</codeph> or + <codeph>SHOW CREATE TABLE</codeph> output. + </p> + + <note> + <p conref="../shared/impala_common.xml#common/avro_no_timestamp"/> + </note> + + <!-- + To do: Expand these examples to show switching between impala-shell and Hive, loading some data, and then + doing DESCRIBE and querying the table. 
+ --> + + <p> + The following examples demonstrate creating an Avro table in Impala, using either an inline column + specification or one taken from a JSON file stored in HDFS: + </p> + +<codeblock><![CDATA[ +[localhost:21000] > CREATE TABLE avro_only_sql_columns + > ( + > id INT, + > bool_col BOOLEAN, + > tinyint_col TINYINT, /* Gets promoted to INT */ + > smallint_col SMALLINT, /* Gets promoted to INT */ + > int_col INT, + > bigint_col BIGINT, + > float_col FLOAT, + > double_col DOUBLE, + > date_string_col STRING, + > string_col STRING + > ) + > STORED AS AVRO; + +[localhost:21000] > CREATE TABLE impala_avro_table + > (bool_col BOOLEAN, int_col INT, long_col BIGINT, float_col FLOAT, double_col DOUBLE, string_col STRING, nullable_int INT) + > STORED AS AVRO + > TBLPROPERTIES ('avro.schema.literal'='{ + > "name": "my_record", + > "type": "record", + > "fields": [ + > {"name":"bool_col", "type":"boolean"}, + > {"name":"int_col", "type":"int"}, + > {"name":"long_col", "type":"long"}, + > {"name":"float_col", "type":"float"}, + > {"name":"double_col", "type":"double"}, + > {"name":"string_col", "type":"string"}, + > {"name": "nullable_int", "type": ["null", "int"]}]}'); + +[localhost:21000] > CREATE TABLE avro_examples_of_all_types ( + > id INT, + > bool_col BOOLEAN, + > tinyint_col TINYINT, + > smallint_col SMALLINT, + > int_col INT, + > bigint_col BIGINT, + > float_col FLOAT, + > double_col DOUBLE, + > date_string_col STRING, + > string_col STRING + > ) + > STORED AS AVRO + > TBLPROPERTIES ('avro.schema.url'='hdfs://localhost:8020/avro_schemas/alltypes.json'); +]]> +</codeblock> + + <p> + The following example demonstrates creating an Avro table in Hive: + </p> + +<codeblock><![CDATA[ +hive> CREATE TABLE hive_avro_table + > ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' + > STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' + > OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' + > TBLPROPERTIES ('avro.schema.literal'='{ + > "name": "my_record", + > "type": "record", + > "fields": [ + > {"name":"bool_col", "type":"boolean"}, + > {"name":"int_col", "type":"int"}, + > {"name":"long_col", "type":"long"}, + > {"name":"float_col", "type":"float"}, + > {"name":"double_col", "type":"double"}, + > {"name":"string_col", "type":"string"}, + > {"name": "nullable_int", "type": ["null", "int"]}]}'); +]]> +</codeblock> + + <p> + Each field of the record becomes a column of the table. Note that any other information, such as the record + name, is ignored. + </p> + +<!-- Have not got a working example of this syntax yet from Lenni. +<p> +The schema can be specified either through the <codeph>TBLPROPERTIES</codeph> clause or the +<codeph>WITH SERDEPROPERTIES</codeph> clause. +For best compatibility with future versions of Hive, use the <codeph>WITH SERDEPROPERTIES</codeph> clause +for this information. +</p> +--> + + <note> + For nullable Avro columns, make sure to put the <codeph>"null"</codeph> entry before the actual type name. + In Impala, all columns are nullable; Impala currently does not have a <codeph>NOT NULL</codeph> clause. Any + non-nullable property is only enforced on the Avro side. + </note> + + <p> + Most column types map directly from Avro to Impala under the same names. 
These are the exceptions and + special cases to consider: + </p> + + <ul> + <li> + The <codeph>DECIMAL</codeph> type is defined in Avro as a <codeph>BYTE</codeph> type with the + <codeph>logicalType</codeph> property set to <codeph>"decimal"</codeph> and a specified precision and + scale. Use <codeph>DECIMAL</codeph> in Avro tables only under CDH 5. The infrastructure and components + under CDH 4 do not have reliable <codeph>DECIMAL</codeph> support. + </li> + + <li> + The Avro <codeph>long</codeph> type maps to <codeph>BIGINT</codeph> in Impala. + </li> + </ul> + + <p> + If you create the table through Hive, switch back to <cmdname>impala-shell</cmdname> and issue an + <codeph>INVALIDATE METADATA <varname>table_name</varname></codeph> statement. Then you can run queries for + that table through <cmdname>impala-shell</cmdname>. + </p> + + <p rev="2.3.0"> + In rare instances, a mismatch could occur between the Avro schema and the column definitions in the + metastore database. In <keyword keyref="impala23_full"/> and higher, Impala checks for such inconsistencies during + a <codeph>CREATE TABLE</codeph> statement and each time it loads the metadata for a table (for example, + after <codeph>INVALIDATE METADATA</codeph>). Impala uses the following rules to determine how to treat + mismatching columns, a process known as <term>schema reconciliation</term>: + <ul> + <li> + If there is a mismatch in the number of columns, Impala uses the column + definitions from the Avro schema. + </li> + <li> + If there is a mismatch in column name or type, Impala uses the column definition from the Avro schema. + Because a <codeph>CHAR</codeph> or <codeph>VARCHAR</codeph> column in Impala maps to an Avro <codeph>STRING</codeph>, + this case is not considered a mismatch and the column is preserved as <codeph>CHAR</codeph> or <codeph>VARCHAR</codeph> + in the reconciled schema. <ph rev="2.7.0 IMPALA-3687 CDH-43731">Prior to <keyword keyref="impala27_full"/> the column + name and comment for such <codeph>CHAR</codeph> and <codeph>VARCHAR</codeph> columns was also taken from the SQL column definition. + In <keyword keyref="impala27_full"/> and higher, the column name and comment from the Avro schema file take precedence for such columns, + and only the <codeph>CHAR</codeph> or <codeph>VARCHAR</codeph> type is preserved from the SQL column definition.</ph> + </li> + <li> + An Impala <codeph>TIMESTAMP</codeph> column definition maps to an Avro <codeph>STRING</codeph> and is presented as a <codeph>STRING</codeph> + in the reconciled schema, because Avro has no binary <codeph>TIMESTAMP</codeph> representation. + As a result, no Avro table can have a <codeph>TIMESTAMP</codeph> column; this restriction is the same as + in earlier CDH and Impala releases. + </li> + </ul> + </p> + + <p conref="../shared/impala_common.xml#common/complex_types_unsupported_filetype"/> + + </conbody> + </concept> + + <concept id="avro_map_table"> + + <title>Using a Hive-Created Avro Table in Impala</title> + + <conbody> + + <p> + If you have an Avro table created through Hive, you can use it in Impala as long as it contains only + Impala-compatible data types. 
It cannot contain: + <ul> + <li> + Complex types: <codeph>array</codeph>, <codeph>map</codeph>, <codeph>record</codeph>, + <codeph>struct</codeph>, <codeph>union</codeph> other than + <codeph>[<varname>supported_type</varname>,null]</codeph> or + <codeph>[null,<varname>supported_type</varname>]</codeph> + </li> + + <li> + The Avro-specific types <codeph>enum</codeph>, <codeph>bytes</codeph>, and <codeph>fixed</codeph> + </li> + + <li> + Any scalar type other than those listed in <xref href="impala_datatypes.xml#datatypes"/> + </li> + </ul> + Because Impala and Hive share the same metastore database, Impala can directly access the table definitions + and data for tables that were created in Hive. + </p> + + <p> + If you create an Avro table in Hive, issue an <codeph>INVALIDATE METADATA</codeph> the next time you + connect to Impala through <cmdname>impala-shell</cmdname>. This is a one-time operation to make Impala + aware of the new table. You can issue the statement while connected to any Impala node, and the catalog + service broadcasts the change to all other Impala nodes. + </p> + + <p> + If you load new data into an Avro table through Hive, either through a Hive <codeph>LOAD DATA</codeph> or + <codeph>INSERT</codeph> statement, or by manually copying or moving files into the data directory for the + table, issue a <codeph>REFRESH <varname>table_name</varname></codeph> statement the next time you connect + to Impala through <cmdname>impala-shell</cmdname>. You can issue the statement while connected to any + Impala node, and the catalog service broadcasts the change to all other Impala nodes. If you issue the + <codeph>LOAD DATA</codeph> statement through Impala, you do not need a <codeph>REFRESH</codeph> afterward. + </p> + + <p> + Impala only supports fields of type <codeph>boolean</codeph>, <codeph>int</codeph>, <codeph>long</codeph>, + <codeph>float</codeph>, <codeph>double</codeph>, and <codeph>string</codeph>, or unions of these types with + null; for example, <codeph>["string", "null"]</codeph>. Unions with <codeph>null</codeph> essentially + create a nullable type. + </p> + </conbody> + </concept> + + <concept id="avro_json"> + + <title>Specifying the Avro Schema through JSON</title> + + <conbody> + + <p> + While you can embed a schema directly in your <codeph>CREATE TABLE</codeph> statement, as shown above, + column width restrictions in the Hive metastore limit the length of schema you can specify. If you + encounter problems with long schema literals, try storing your schema as a <codeph>JSON</codeph> file in + HDFS instead. Specify your schema in HDFS using table properties similar to the following: + </p> + +<codeblock>tblproperties ('avro.schema.url'='hdfs//your-name-node:port/path/to/schema.json');</codeblock> + </conbody> + </concept> + + <concept id="avro_load_data"> + + <title>Loading Data into an Avro Table</title> + <prolog> + <metadata> + <data name="Category" value="ETL"/> + <data name="Category" value="Ingest"/> + </metadata> + </prolog> + + <conbody> + + <p rev="DOCS-1523"> + Currently, Impala cannot write Avro data files. Therefore, an Avro table cannot be used as the destination + of an Impala <codeph>INSERT</codeph> statement or <codeph>CREATE TABLE AS SELECT</codeph>. + </p> + + <p> + To copy data from another table, issue any <codeph>INSERT</codeph> statements through Hive. 
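+        For example, a minimal sequence is sketched below (the table names are illustrative):
+<codeblock>-- In the Hive shell; Impala cannot INSERT into Avro tables:
+INSERT INTO TABLE avro_table SELECT * FROM some_other_table;
+
+-- Then in impala-shell, pick up the newly added data files:
+REFRESH avro_table;
+</codeblock>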
For information + about loading data into Avro tables through Hive, see + <xref href="https://cwiki.apache.org/confluence/display/Hive/AvroSerDe" scope="external" format="html">Avro + page on the Hive wiki</xref>. + </p> + + <p> + If you already have data files in Avro format, you can also issue <codeph>LOAD DATA</codeph> in either + Impala or Hive. Impala can move existing Avro data files into an Avro table, it just cannot create new + Avro data files. + </p> + + </conbody> + </concept> + + <concept id="avro_compression"> + + <title>Enabling Compression for Avro Tables</title> + <prolog> + <metadata> + <data name="Category" value="Compression"/> + <data name="Category" value="Snappy"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">compression</indexterm> + To enable compression for Avro tables, specify settings in the Hive shell to enable compression and to + specify a codec, then issue a <codeph>CREATE TABLE</codeph> statement as in the preceding examples. Impala + supports the <codeph>snappy</codeph> and <codeph>deflate</codeph> codecs for Avro tables. + </p> + + <p> + For example: + </p> + +<codeblock>hive> set hive.exec.compress.output=true; +hive> set avro.output.codec=snappy;</codeblock> + </conbody> + </concept> + + <concept rev="1.1" id="avro_schema_evolution"> + + <title>How Impala Handles Avro Schema Evolution</title> + <prolog> + <metadata> + <data name="Category" value="Concepts"/> + </metadata> + </prolog> + + <conbody> + + <p> + Starting in Impala 1.1, Impala can deal with Avro data files that employ <term>schema evolution</term>, + where different data files within the same table use slightly different type definitions. (You would + perform the schema evolution operation by issuing an <codeph>ALTER TABLE</codeph> statement in the Hive + shell.) The old and new types for any changed columns must be compatible, for example a column might start + as an <codeph>int</codeph> and later change to a <codeph>bigint</codeph> or <codeph>float</codeph>. + </p> + + <p> + As with any other tables where the definitions are changed or data is added outside of the current + <cmdname>impalad</cmdname> node, ensure that Impala loads the latest metadata for the table if the Avro + schema is modified through Hive. Issue a <codeph>REFRESH <varname>table_name</varname></codeph> or + <codeph>INVALIDATE METADATA <varname>table_name</varname></codeph> statement. <codeph>REFRESH</codeph> + reloads the metadata immediately, <codeph>INVALIDATE METADATA</codeph> reloads the metadata the next time + the table is accessed. + </p> + + <p> + When Avro data files or columns are not consulted during a query, Impala does not check for consistency. + Thus, if you issue <codeph>SELECT c1, c2 FROM t1</codeph>, Impala does not return any error if the column + <codeph>c3</codeph> changed in an incompatible way. If a query retrieves data from some partitions but not + others, Impala does not check the data files for the unused partitions. + </p> + + <p> + In the Hive DDL statements, you can specify an <codeph>avro.schema.literal</codeph> table property (if the + schema definition is short) or an <codeph>avro.schema.url</codeph> property (if the schema definition is + long, or to allow convenient editing for the definition). 
+ </p> + + <p> + For example, running the following SQL code in the Hive shell creates a table using the Avro file format + and puts some sample data into it: + </p> + +<codeblock>CREATE TABLE avro_table (a string, b string) +ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' +STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' +OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' +TBLPROPERTIES ( + 'avro.schema.literal'='{ + "type": "record", + "name": "my_record", + "fields": [ + {"name": "a", "type": "int"}, + {"name": "b", "type": "string"} + ]}'); + +INSERT OVERWRITE TABLE avro_table SELECT 1, "avro" FROM functional.alltypes LIMIT 1; +</codeblock> + + <p> + Once the Avro table is created and contains data, you can query it through the + <cmdname>impala-shell</cmdname> command: + </p> + +<codeblock>[localhost:21000] > select * from avro_table; ++---+------+ +| a | b | ++---+------+ +| 1 | avro | ++---+------+ +</codeblock> + + <p> + Now in the Hive shell, you change the type of a column and add a new column with a default value: + </p> + +<codeblock>-- Promote column "a" from INT to FLOAT (no need to update Avro schema) +ALTER TABLE avro_table CHANGE A A FLOAT; + +-- Add column "c" with default +ALTER TABLE avro_table ADD COLUMNS (c int); +ALTER TABLE avro_table SET TBLPROPERTIES ( + 'avro.schema.literal'='{ + "type": "record", + "name": "my_record", + "fields": [ + {"name": "a", "type": "int"}, + {"name": "b", "type": "string"}, + {"name": "c", "type": "int", "default": 10} + ]}'); +</codeblock> + + <p> + Once again in <cmdname>impala-shell</cmdname>, you can query the Avro table based on its latest schema + definition. Because the table metadata was changed outside of Impala, you issue a <codeph>REFRESH</codeph> + statement first so that Impala has up-to-date metadata for the table. + </p> + +<codeblock>[localhost:21000] > refresh avro_table; +[localhost:21000] > select * from avro_table; ++---+------+----+ +| a | b | c | ++---+------+----+ +| 1 | avro | 10 | ++---+------+----+ +</codeblock> + </conbody> + </concept> + + <concept id="avro_data_types"> + + <title>Data Type Considerations for Avro Tables</title> + + <conbody> + + <p> + The Avro format defines a set of data types whose names differ from the names of the corresponding Impala + data types. If you are preparing Avro files using other Hadoop components such as Pig or MapReduce, you + might need to work with the type names defined by Avro. The following figure lists the Avro-defined types + and the equivalent types in Impala. + </p> + +<codeblock><![CDATA[Primitive Types (Avro -> Impala) +-------------------------------- +STRING -> STRING +STRING -> CHAR +STRING -> VARCHAR +INT -> INT +BOOLEAN -> BOOLEAN +LONG -> BIGINT +FLOAT -> FLOAT +DOUBLE -> DOUBLE + +Logical Types +------------- +BYTES + logicalType = "decimal" -> DECIMAL + +Avro Types with No Impala Equivalent +------------------------------------ +RECORD, MAP, ARRAY, UNION, ENUM, FIXED, NULL + +Impala Types with No Avro Equivalent +------------------------------------ +TIMESTAMP +]]> +</codeblock> + + <p conref="../shared/impala_common.xml#common/avro_2gb_strings"/> + + </conbody> + </concept> + + <concept id="avro_performance"> + + <title>Query Performance for Impala Avro Tables</title> + + <conbody> + + <p> + In general, expect query performance with Avro tables to be + faster than with tables using text data, but slower than with + Parquet tables. 
See <xref href="impala_parquet.xml#parquet"/> + for information about using the Parquet file format for + high-performance analytic queries. + </p> + + <p conref="../shared/impala_common.xml#common/s3_block_splitting"/> + + </conbody> + </concept> + +</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_batch_size.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_batch_size.xml b/docs/topics/impala_batch_size.xml new file mode 100644 index 0000000..eb54b14 --- /dev/null +++ b/docs/topics/impala_batch_size.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="batch_size"> + + <title>BATCH_SIZE Query Option</title> + <titlealts audience="PDF"><navtitle>BATCH_SIZE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Performance"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">BATCH_SIZE query option</indexterm> + Number of rows evaluated at a time by SQL operators. Unspecified or a size of 0 uses a predefined default + size. Using a large number improves responsiveness, especially for scan operations, at the cost of a higher memory footprint. + </p> + + <p> + This option is primarily for testing during Impala development, or for use under the direction of <keyword keyref="support_org"/>. + </p> + + <p> + <b>Type:</b> numeric + </p> + + <p> + <b>Default:</b> 0 (meaning the predefined default of 1024) + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_bigint.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_bigint.xml b/docs/topics/impala_bigint.xml new file mode 100644 index 0000000..d9223b0 --- /dev/null +++ b/docs/topics/impala_bigint.xml @@ -0,0 +1,102 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="bigint"> + + <title>BIGINT Data Type</title> + <titlealts audience="PDF"><navtitle>BIGINT</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Data Types"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Schemas"/> + </metadata> + </prolog> + + <conbody> + + <p> + An 8-byte integer data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> + statements. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + + <p> + In the column definition of a <codeph>CREATE TABLE</codeph> statement: + </p> + +<codeblock><varname>column_name</varname> BIGINT</codeblock> + + <p> + <b>Range:</b> -9223372036854775808 .. 9223372036854775807. There is no <codeph>UNSIGNED</codeph> subtype. + </p> + + <p> + <b>Conversions:</b> Impala automatically converts to a floating-point type (<codeph>FLOAT</codeph> or + <codeph>DOUBLE</codeph>) automatically. Use <codeph>CAST()</codeph> to convert to <codeph>TINYINT</codeph>, + <codeph>SMALLINT</codeph>, <codeph>INT</codeph>, <codeph>STRING</codeph>, or <codeph>TIMESTAMP</codeph>. 
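+      For example, an informal sketch of explicit casts from <codeph>BIGINT</codeph> to a smaller
+      integer type and to <codeph>STRING</codeph>, using a value small enough to fit in each target type:
+<codeblock>SELECT CAST(1000 AS BIGINT) AS big_value,
+       CAST(CAST(1000 AS BIGINT) AS SMALLINT) AS as_smallint,
+       CAST(CAST(1000 AS BIGINT) AS STRING) AS as_string;
+</codeblock>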
+ <ph conref="../shared/impala_common.xml#common/cast_int_to_timestamp"/> + </p> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<codeblock>CREATE TABLE t1 (x BIGINT); +SELECT CAST(1000 AS BIGINT); +</codeblock> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + <codeph>BIGINT</codeph> is a convenient type to use for column declarations because you can use any kind of + integer values in <codeph>INSERT</codeph> statements and they are promoted to <codeph>BIGINT</codeph> where + necessary. However, <codeph>BIGINT</codeph> also requires the most bytes of any integer type on disk and in + memory, meaning your queries are not as efficient and scalable as possible if you overuse this type. + Therefore, prefer to use the smallest integer type with sufficient range to hold all input values, and + <codeph>CAST()</codeph> when necessary to the appropriate type. + </p> + + <p> + For a convenient and automated way to check the bounds of the <codeph>BIGINT</codeph> type, call the + functions <codeph>MIN_BIGINT()</codeph> and <codeph>MAX_BIGINT()</codeph>. + </p> + + <p> + If an integer value is too large to be represented as a <codeph>BIGINT</codeph>, use a + <codeph>DECIMAL</codeph> instead with sufficient digits of precision. + </p> + + <p conref="../shared/impala_common.xml#common/null_bad_numeric_cast"/> + + <p conref="../shared/impala_common.xml#common/partitioning_good"/> + + <p conref="../shared/impala_common.xml#common/hbase_ok"/> + +<!-- <p conref="../shared/impala_common.xml#common/parquet_blurb"/> --> + + <p conref="../shared/impala_common.xml#common/text_bulky"/> + +<!-- <p conref="../shared/impala_common.xml#common/compatibility_blurb"/> --> + + <p conref="../shared/impala_common.xml#common/internals_8_bytes"/> + + <p conref="../shared/impala_common.xml#common/added_forever"/> + + <p conref="../shared/impala_common.xml#common/column_stats_constant"/> + + <p conref="../shared/impala_common.xml#common/sqoop_blurb"/> + + <p conref="../shared/impala_common.xml#common/sqoop_timestamp_caveat"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_literals.xml#numeric_literals"/>, <xref href="impala_tinyint.xml#tinyint"/>, + <xref href="impala_smallint.xml#smallint"/>, <xref href="impala_int.xml#int"/>, + <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_decimal.xml#decimal"/>, + <xref href="impala_math_functions.xml#math_functions"/> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_bit_functions.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_bit_functions.xml b/docs/topics/impala_bit_functions.xml new file mode 100644 index 0000000..2ac0f68 --- /dev/null +++ b/docs/topics/impala_bit_functions.xml @@ -0,0 +1,794 @@ +<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="bit_functions" rev="2.3.0"> + + <title>Impala Bit Functions</title> + <titlealts audience="PDF"><navtitle>Bit Functions</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Functions"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Querying"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.3.0"> + Bit manipulation functions perform 
bitwise operations involved in scientific processing or computer science algorithms. + For example, these functions include setting, clearing, or testing bits within an integer value, or changing the + positions of bits with or without wraparound. + </p> + + <p> + If a function takes two integer arguments that are required to be of the same type, the smaller argument is promoted + to the type of the larger one if required. For example, <codeph>BITAND(1,4096)</codeph> treats both arguments as + <codeph>SMALLINT</codeph>, because 1 can be represented as a <codeph>TINYINT</codeph> but 4096 requires a <codeph>SMALLINT</codeph>. + </p> + + <p> + Remember that all Impala integer values are signed. Therefore, when dealing with binary values where the most significant + bit is 1, the specified or returned values might be negative when represented in base 10. + </p> + + <p> + Whenever any argument is <codeph>NULL</codeph>, either the input value, bit position, or number of shift or rotate positions, + the return value from any of these functions is also <codeph>NULL</codeph> + </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + The bit functions operate on all the integral data types: <xref href="impala_int.xml#int"/>, + <xref href="impala_bigint.xml#bigint"/>, <xref href="impala_smallint.xml#smallint"/>, and + <xref href="impala_tinyint.xml#tinyint"/>. + </p> + + <p> + <b>Function reference:</b> + </p> + + <p> + Impala supports the following bit functions: + </p> + +<!-- +bitand +bitnot +bitor +bitxor +countset +getbit +rotateleft +rotateright +setbit +shiftleft +shiftright +--> + + <dl> + + <dlentry id="bitand"> + + <dt> + <codeph>bitand(integer_type a, same_type b)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">bitand() function</indexterm> + <b>Purpose:</b> Returns an integer value representing the bits that are set to 1 in both of the arguments. + If the arguments are of different sizes, the smaller is promoted to the type of the larger. + <p> + <b>Usage notes:</b> The <codeph>bitand()</codeph> function is equivalent to the <codeph>&</codeph> binary operator. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> + <p> + The following examples show the results of ANDing integer values. + 255 contains all 1 bits in its lowermost 7 bits. + 32767 contains all 1 bits in its lowermost 15 bits. + <!-- + Negative numbers have a 1 in the sign bit and the value is the + <xref href="https://en.wikipedia.org/wiki/Two%27s_complement" scope="external" format="html">two's complement</xref> + of the positive equivalent. + --> + You can use the <codeph>bin()</codeph> function to check the binary representation of any + integer value, although the result is always represented as a 64-bit value. + If necessary, the smaller argument is promoted to the + type of the larger one. 
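+        For example, you can confirm these bit patterns with <codeph>bin()</codeph>; the strings
+        it returns are shown here as comments for readability:
+<codeblock>select bin(255);    /* 11111111 */
+select bin(32767);  /* 111111111111111 */
+select bin(-1);     /* sixty-four 1 digits, because the value is treated as a 64-bit quantity */
+</codeblock>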
+ </p> +<codeblock>select bitand(255, 32767); /* 0000000011111111 & 0111111111111111 */ ++--------------------+ +| bitand(255, 32767) | ++--------------------+ +| 255 | ++--------------------+ + +select bitand(32767, 1); /* 0111111111111111 & 0000000000000001 */ ++------------------+ +| bitand(32767, 1) | ++------------------+ +| 1 | ++------------------+ + +select bitand(32, 16); /* 00010000 & 00001000 */ ++----------------+ +| bitand(32, 16) | ++----------------+ +| 0 | ++----------------+ + +select bitand(12,5); /* 00001100 & 00000101 */ ++---------------+ +| bitand(12, 5) | ++---------------+ +| 4 | ++---------------+ + +select bitand(-1,15); /* 11111111 & 00001111 */ ++----------------+ +| bitand(-1, 15) | ++----------------+ +| 15 | ++----------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="bitnot"> + + <dt> + <codeph>bitnot(integer_type a)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">bitnot() function</indexterm> + <b>Purpose:</b> Inverts all the bits of the input argument. + <p> + <b>Usage notes:</b> The <codeph>bitnot()</codeph> function is equivalent to the <codeph>~</codeph> unary operator. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> + <p> + These examples illustrate what happens when you flip all the bits of an integer value. + The sign always changes. The decimal representation is one different between the positive and + negative values. + <!-- + because negative values are represented as the + <xref href="https://en.wikipedia.org/wiki/Two%27s_complement" scope="external" format="html">two's complement</xref> + of the corresponding positive value. + --> + </p> +<codeblock>select bitnot(127); /* 01111111 -> 10000000 */ ++-------------+ +| bitnot(127) | ++-------------+ +| -128 | ++-------------+ + +select bitnot(16); /* 00010000 -> 11101111 */ ++------------+ +| bitnot(16) | ++------------+ +| -17 | ++------------+ + +select bitnot(0); /* 00000000 -> 11111111 */ ++-----------+ +| bitnot(0) | ++-----------+ +| -1 | ++-----------+ + +select bitnot(-128); /* 10000000 -> 01111111 */ ++--------------+ +| bitnot(-128) | ++--------------+ +| 127 | ++--------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="bitor"> + + <dt> + <codeph>bitor(integer_type a, same_type b)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">bitor() function</indexterm> + <b>Purpose:</b> Returns an integer value representing the bits that are set to 1 in either of the arguments. + If the arguments are of different sizes, the smaller is promoted to the type of the larger. + <p> + <b>Usage notes:</b> The <codeph>bitor()</codeph> function is equivalent to the <codeph>|</codeph> binary operator. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> + <p> + The following examples show the results of ORing integer values. 
+ </p> +<codeblock>select bitor(1,4); /* 00000001 | 00000100 */ ++-------------+ +| bitor(1, 4) | ++-------------+ +| 5 | ++-------------+ + +select bitor(16,48); /* 00001000 | 00011000 */ ++---------------+ +| bitor(16, 48) | ++---------------+ +| 48 | ++---------------+ + +select bitor(0,7); /* 00000000 | 00000111 */ ++-------------+ +| bitor(0, 7) | ++-------------+ +| 7 | ++-------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="bitxor"> + + <dt> + <codeph>bitxor(integer_type a, same_type b)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">bitxor() function</indexterm> + <b>Purpose:</b> Returns an integer value representing the bits that are set to 1 in one but not both of the arguments. + If the arguments are of different sizes, the smaller is promoted to the type of the larger. + <p> + <b>Usage notes:</b> The <codeph>bitxor()</codeph> function is equivalent to the <codeph>^</codeph> binary operator. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> + <p> + The following examples show the results of XORing integer values. + XORing a non-zero value with zero returns the non-zero value. + XORing two identical values returns zero, because all the 1 bits from the first argument are also 1 bits in the second argument. + XORing different non-zero values turns off some bits and leaves others turned on, based on whether the same bit is set in both arguments. + </p> +<codeblock>select bitxor(0,15); /* 00000000 ^ 00001111 */ ++---------------+ +| bitxor(0, 15) | ++---------------+ +| 15 | ++---------------+ + +select bitxor(7,7); /* 00000111 ^ 00000111 */ ++--------------+ +| bitxor(7, 7) | ++--------------+ +| 0 | ++--------------+ + +select bitxor(8,4); /* 00001000 ^ 00000100 */ ++--------------+ +| bitxor(8, 4) | ++--------------+ +| 12 | ++--------------+ + +select bitxor(3,7); /* 00000011 ^ 00000111 */ ++--------------+ +| bitxor(3, 7) | ++--------------+ +| 4 | ++--------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="countset"> + + <dt> + <codeph>countset(integer_type a [, int zero_or_one])</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">countset() function</indexterm> + <b>Purpose:</b> By default, returns the number of 1 bits in the specified integer value. + If the optional second argument is set to zero, it returns the number of 0 bits instead. + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + In discussions of information theory, this operation is referred to as the + <q><xref href="https://en.wikipedia.org/wiki/Hamming_weight" scope="external" format="html">population count</xref></q> + or <q>popcount</q>. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> + <p> + The following examples show how to count the number of 1 bits in an integer value. 
+ </p> +<codeblock>select countset(1); /* 00000001 */ ++-------------+ +| countset(1) | ++-------------+ +| 1 | ++-------------+ + +select countset(3); /* 00000011 */ ++-------------+ +| countset(3) | ++-------------+ +| 2 | ++-------------+ + +select countset(16); /* 00010000 */ ++--------------+ +| countset(16) | ++--------------+ +| 1 | ++--------------+ + +select countset(17); /* 00010001 */ ++--------------+ +| countset(17) | ++--------------+ +| 2 | ++--------------+ + +select countset(7,1); /* 00000111 = 3 1 bits; the function counts 1 bits by default */ ++----------------+ +| countset(7, 1) | ++----------------+ +| 3 | ++----------------+ + +select countset(7,0); /* 00000111 = 5 0 bits; third argument can only be 0 or 1 */ ++----------------+ +| countset(7, 0) | ++----------------+ +| 5 | ++----------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="getbit"> + + <dt> + <codeph>getbit(integer_type a, int position)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">getbit() function</indexterm> + <b>Purpose:</b> Returns a 0 or 1 representing the bit at a + specified position. The positions are numbered right to left, starting at zero. + The position argument cannot be negative. + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + When you use a literal input value, it is treated as an 8-bit, 16-bit, + and so on value, the smallest type that is appropriate. + The type of the input value limits the range of the positions. + Cast the input value to the appropriate type if you need to + ensure it is treated as a 64-bit, 32-bit, and so on value. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> + <p> + The following examples show how to test a specific bit within an integer value. + </p> +<codeblock>select getbit(1,0); /* 00000001 */ ++--------------+ +| getbit(1, 0) | ++--------------+ +| 1 | ++--------------+ + +select getbit(16,1) /* 00010000 */ ++---------------+ +| getbit(16, 1) | ++---------------+ +| 0 | ++---------------+ + +select getbit(16,4) /* 00010000 */ ++---------------+ +| getbit(16, 4) | ++---------------+ +| 1 | ++---------------+ + +select getbit(16,5) /* 00010000 */ ++---------------+ +| getbit(16, 5) | ++---------------+ +| 0 | ++---------------+ + +select getbit(-1,3); /* 11111111 */ ++---------------+ +| getbit(-1, 3) | ++---------------+ +| 1 | ++---------------+ + +select getbit(-1,25); /* 11111111 */ +ERROR: Invalid bit position: 25 + +select getbit(cast(-1 as int),25); /* 11111111111111111111111111111111 */ ++-----------------------------+ +| getbit(cast(-1 as int), 25) | ++-----------------------------+ +| 1 | ++-----------------------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="rotateleft"> + + <dt> + <codeph>rotateleft(integer_type a, int positions)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">rotateleft() function</indexterm> + <b>Purpose:</b> Rotates an integer value left by a specified number of bits. + As the most significant bit is taken out of the original value, + if it is a 1 bit, it is <q>rotated</q> back to the least significant bit. + Therefore, the final value has the same number of 1 bits as the original value, + just in different positions. + In computer science terms, this operation is a + <q><xref href="https://en.wikipedia.org/wiki/Circular_shift" scope="external" format="html">circular shift</xref></q>. 
+ <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + Specifying a second argument of zero leaves the original value unchanged. + Rotating a -1 value by any number of positions still returns -1, + because the original value has all 1 bits and all the 1 bits are + preserved during rotation. + Similarly, rotating a 0 value by any number of positions still returns 0. + Rotating a value by the same number of bits as in the value returns the same value. + Because this is a circular operation, the number of positions is not limited + to the number of bits in the input value. + For example, rotating an 8-bit value by 1, 9, 17, and so on positions returns an + identical result in each case. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> +<codeblock>select rotateleft(1,4); /* 00000001 -> 00010000 */ ++------------------+ +| rotateleft(1, 4) | ++------------------+ +| 16 | ++------------------+ + +select rotateleft(-1,155); /* 11111111 -> 11111111 */ ++---------------------+ +| rotateleft(-1, 155) | ++---------------------+ +| -1 | ++---------------------+ + +select rotateleft(-128,1); /* 10000000 -> 00000001 */ ++---------------------+ +| rotateleft(-128, 1) | ++---------------------+ +| 1 | ++---------------------+ + +select rotateleft(-127,3); /* 10000001 -> 00001100 */ ++---------------------+ +| rotateleft(-127, 3) | ++---------------------+ +| 12 | ++---------------------+ + +</codeblock> + </dd> + + </dlentry> + + <dlentry id="rotateright"> + + <dt> + <codeph>rotateright(integer_type a, int positions)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">rotateright() function</indexterm> + <b>Purpose:</b> Rotates an integer value right by a specified number of bits. + As the least significant bit is taken out of the original value, + if it is a 1 bit, it is <q>rotated</q> back to the most significant bit. + Therefore, the final value has the same number of 1 bits as the original value, + just in different positions. + In computer science terms, this operation is a + <q><xref href="https://en.wikipedia.org/wiki/Circular_shift" scope="external" format="html">circular shift</xref></q>. + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + Specifying a second argument of zero leaves the original value unchanged. + Rotating a -1 value by any number of positions still returns -1, + because the original value has all 1 bits and all the 1 bits are + preserved during rotation. + Similarly, rotating a 0 value by any number of positions still returns 0. + Rotating a value by the same number of bits as in the value returns the same value. + Because this is a circular operation, the number of positions is not limited + to the number of bits in the input value. + For example, rotating an 8-bit value by 1, 9, 17, and so on positions returns an + identical result in each case. 
+ </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> +<codeblock>select rotateright(16,4); /* 00010000 -> 00000001 */ ++--------------------+ +| rotateright(16, 4) | ++--------------------+ +| 1 | ++--------------------+ + +select rotateright(-1,155); /* 11111111 -> 11111111 */ ++----------------------+ +| rotateright(-1, 155) | ++----------------------+ +| -1 | ++----------------------+ + +select rotateright(-128,1); /* 10000000 -> 01000000 */ ++----------------------+ +| rotateright(-128, 1) | ++----------------------+ +| 64 | ++----------------------+ + +select rotateright(-127,3); /* 10000001 -> 00110000 */ ++----------------------+ +| rotateright(-127, 3) | ++----------------------+ +| 48 | ++----------------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="setbit"> + + <dt> + <codeph>setbit(integer_type a, int position [, int zero_or_one])</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">setbit() function</indexterm> + <b>Purpose:</b> By default, changes a bit at a specified position to a 1, if it is not already. + If the optional third argument is set to zero, the specified bit is set to 0 instead. + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + If the bit at the specified position was already 1 (by default) + or 0 (with a third argument of zero), the return value is + the same as the first argument. + The positions are numbered right to left, starting at zero. + (Therefore, the return value could be different from the first argument + even if the position argument is zero.) + The position argument cannot be negative. + <p> + When you use a literal input value, it is treated as an 8-bit, 16-bit, + and so on value, the smallest type that is appropriate. + The type of the input value limits the range of the positions. + Cast the input value to the appropriate type if you need to + ensure it is treated as a 64-bit, 32-bit, and so on value. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> +<codeblock>select setbit(0,0); /* 00000000 -> 00000001 */ ++--------------+ +| setbit(0, 0) | ++--------------+ +| 1 | ++--------------+ + +select setbit(0,3); /* 00000000 -> 00001000 */ ++--------------+ +| setbit(0, 3) | ++--------------+ +| 8 | ++--------------+ + +select setbit(7,3); /* 00000111 -> 00001111 */ ++--------------+ +| setbit(7, 3) | ++--------------+ +| 15 | ++--------------+ + +select setbit(15,3); /* 00001111 -> 00001111 */ ++---------------+ +| setbit(15, 3) | ++---------------+ +| 15 | ++---------------+ + +select setbit(0,32); /* By default, 0 is a TINYINT with only 8 bits. */ +ERROR: Invalid bit position: 32 + +select setbit(cast(0 as bigint),32); /* For BIGINT, the position can be 0..63. 
*/ ++-------------------------------+ +| setbit(cast(0 as bigint), 32) | ++-------------------------------+ +| 4294967296 | ++-------------------------------+ + +select setbit(7,3,1); /* 00000111 -> 00001111; setting to 1 is the default */ ++-----------------+ +| setbit(7, 3, 1) | ++-----------------+ +| 15 | ++-----------------+ + +select setbit(7,2,0); /* 00000111 -> 00000011; third argument of 0 clears instead of sets */ ++-----------------+ +| setbit(7, 2, 0) | ++-----------------+ +| 3 | ++-----------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="shiftleft"> + + <dt> + <codeph>shiftleft(integer_type a, int positions)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">shiftleft() function</indexterm> + <b>Purpose:</b> Shifts an integer value left by a specified number of bits. + As the most significant bit is taken out of the original value, + it is discarded and the least significant bit becomes 0. + In computer science terms, this operation is a <q><xref href="https://en.wikipedia.org/wiki/Logical_shift" scope="external" format="html">logical shift</xref></q>. + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + The final value has either the same number of 1 bits as the original value, or fewer. + Shifting an 8-bit value by 8 positions, a 16-bit value by 16 positions, and so on produces + a result of zero. + </p> + <p> + Specifying a second argument of zero leaves the original value unchanged. + Shifting any value by 0 returns the original value. + Shifting any value by 1 is the same as multiplying it by 2, + as long as the value is small enough; larger values eventually + become negative when shifted, as the sign bit is set. + Starting with the value 1 and shifting it left by N positions gives + the same result as 2 to the Nth power, or <codeph>pow(2,<varname>N</varname>)</codeph>. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> +<codeblock>select shiftleft(1,0); /* 00000001 -> 00000001 */ ++-----------------+ +| shiftleft(1, 0) | ++-----------------+ +| 1 | ++-----------------+ + +select shiftleft(1,3); /* 00000001 -> 00001000 */ ++-----------------+ +| shiftleft(1, 3) | ++-----------------+ +| 8 | ++-----------------+ + +select shiftleft(8,2); /* 00001000 -> 00100000 */ ++-----------------+ +| shiftleft(8, 2) | ++-----------------+ +| 32 | ++-----------------+ + +select shiftleft(127,1); /* 01111111 -> 11111110 */ ++-------------------+ +| shiftleft(127, 1) | ++-------------------+ +| -2 | ++-------------------+ + +select shiftleft(127,5); /* 01111111 -> 11100000 */ ++-------------------+ +| shiftleft(127, 5) | ++-------------------+ +| -32 | ++-------------------+ + +select shiftleft(-1,4); /* 11111111 -> 11110000 */ ++------------------+ +| shiftleft(-1, 4) | ++------------------+ +| -16 | ++------------------+ +</codeblock> + </dd> + + </dlentry> + + <dlentry id="shiftright"> + + <dt> + <codeph>shiftright(integer_type a, int positions)</codeph> + </dt> + + <dd> + <indexterm audience="Cloudera">shiftright() function</indexterm> + <b>Purpose:</b> Shifts an integer value right by a specified number of bits. + As the least significant bit is taken out of the original value, + it is discarded and the most significant bit becomes 0. 
+ In computer science terms, this operation is a <q><xref href="https://en.wikipedia.org/wiki/Logical_shift" scope="external" format="html">logical shift</xref></q>. + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + <p> + Therefore, the final value has either the same number of 1 bits as the original value, or fewer. + Shifting an 8-bit value by 8 positions, a 16-bit value by 16 positions, and so on produces + a result of zero. + </p> + <p> + Specifying a second argument of zero leaves the original value unchanged. + Shifting any value by 0 returns the original value. + Shifting any positive value right by 1 is the same as dividing it by 2. + Negative values become positive when shifted right. + </p> + <p conref="../shared/impala_common.xml#common/return_type_same"/> + <p conref="../shared/impala_common.xml#common/added_in_230"/> + <p conref="../shared/impala_common.xml#common/example_blurb"/> +<codeblock>select shiftright(16,0); /* 00010000 -> 00000000 */ ++-------------------+ +| shiftright(16, 0) | ++-------------------+ +| 16 | ++-------------------+ + +select shiftright(16,4); /* 00010000 -> 00000000 */ ++-------------------+ +| shiftright(16, 4) | ++-------------------+ +| 1 | ++-------------------+ + +select shiftright(16,5); /* 00010000 -> 00000000 */ ++-------------------+ +| shiftright(16, 5) | ++-------------------+ +| 0 | ++-------------------+ + +select shiftright(-1,1); /* 11111111 -> 01111111 */ ++-------------------+ +| shiftright(-1, 1) | ++-------------------+ +| 127 | ++-------------------+ + +select shiftright(-1,5); /* 11111111 -> 00000111 */ ++-------------------+ +| shiftright(-1, 5) | ++-------------------+ +| 7 | ++-------------------+ +</codeblock> + </dd> + + </dlentry> + + </dl> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_boolean.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_boolean.xml b/docs/topics/impala_boolean.xml new file mode 100644 index 0000000..c788b33 --- /dev/null +++ b/docs/topics/impala_boolean.xml @@ -0,0 +1,154 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="boolean"> + + <title>BOOLEAN Data Type</title> + <titlealts audience="PDF"><navtitle>BOOLEAN</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Data Types"/> + <data name="Category" value="SQL"/> + <data name="Category" value="Data Analysts"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Schemas"/> + </metadata> + </prolog> + + <conbody> + + <p> + A data type used in <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph> statements, representing a + single true/false choice. + </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + + <p> + In the column definition of a <codeph>CREATE TABLE</codeph> statement: + </p> + +<codeblock><varname>column_name</varname> BOOLEAN</codeblock> + + <p> + <b>Range:</b> <codeph>TRUE</codeph> or <codeph>FALSE</codeph>. Do not use quotation marks around the + <codeph>TRUE</codeph> and <codeph>FALSE</codeph> literal values. You can write the literal values in + uppercase, lowercase, or mixed case. The values queried from a table are always returned in lowercase, + <codeph>true</codeph> or <codeph>false</codeph>. 
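+      For example, all of the following literal forms are accepted, and the result columns are
+      displayed in lowercase (an illustrative query only):
+<codeblock>SELECT TRUE AS t1, true AS t2, True AS t3;</codeblock>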
+ </p> + + <p> + <b>Conversions:</b> Impala does not automatically convert any other type to <codeph>BOOLEAN</codeph>. All + conversions must use an explicit call to the <codeph>CAST()</codeph> function. + </p> + + <p> + You can use <codeph>CAST()</codeph> to convert +<!-- + <codeph>TINYINT</codeph>, <codeph>SMALLINT</codeph>, + <codeph>INT</codeph>, <codeph>BIGINT</codeph>, <codeph>FLOAT</codeph>, or <codeph>DOUBLE</codeph> +--> + any integer or floating-point type to + <codeph>BOOLEAN</codeph>: a value of 0 represents <codeph>false</codeph>, and any non-zero value is converted + to <codeph>true</codeph>. + </p> + +<codeblock>SELECT CAST(42 AS BOOLEAN) AS nonzero_int, CAST(99.44 AS BOOLEAN) AS nonzero_decimal, + CAST(000 AS BOOLEAN) AS zero_int, CAST(0.0 AS BOOLEAN) AS zero_decimal; ++-------------+-----------------+----------+--------------+ +| nonzero_int | nonzero_decimal | zero_int | zero_decimal | ++-------------+-----------------+----------+--------------+ +| true | true | false | false | ++-------------+-----------------+----------+--------------+ +</codeblock> + + <p> + When you cast the opposite way, from <codeph>BOOLEAN</codeph> to a numeric type, + the result becomes either 1 or 0: + </p> + +<codeblock>SELECT CAST(true AS INT) AS true_int, CAST(true AS DOUBLE) AS true_double, + CAST(false AS INT) AS false_int, CAST(false AS DOUBLE) AS false_double; ++----------+-------------+-----------+--------------+ +| true_int | true_double | false_int | false_double | ++----------+-------------+-----------+--------------+ +| 1 | 1 | 0 | 0 | ++----------+-------------+-----------+--------------+ +</codeblock> + + <p rev="1.4.0"> +<!-- BOOLEAN-to-DECIMAL casting requested in IMPALA-991. As of Sept. 2014, designated "won't fix". --> + You can cast <codeph>DECIMAL</codeph> values to <codeph>BOOLEAN</codeph>, with the same treatment of zero and + non-zero values as the other numeric types. You cannot cast a <codeph>BOOLEAN</codeph> to a + <codeph>DECIMAL</codeph>. + </p> + + <p> + You cannot cast a <codeph>STRING</codeph> value to <codeph>BOOLEAN</codeph>, although you can cast a + <codeph>BOOLEAN</codeph> value to <codeph>STRING</codeph>, returning <codeph>'1'</codeph> for + <codeph>true</codeph> values and <codeph>'0'</codeph> for <codeph>false</codeph> values. + </p> + + <p> + Although you can cast a <codeph>TIMESTAMP</codeph> to a <codeph>BOOLEAN</codeph> or a + <codeph>BOOLEAN</codeph> to a <codeph>TIMESTAMP</codeph>, the results are unlikely to be useful. Any non-zero + <codeph>TIMESTAMP</codeph> (that is, any value other than <codeph>1970-01-01 00:00:00</codeph>) becomes + <codeph>TRUE</codeph> when converted to <codeph>BOOLEAN</codeph>, while <codeph>1970-01-01 00:00:00</codeph> + becomes <codeph>FALSE</codeph>. A value of <codeph>FALSE</codeph> becomes <codeph>1970-01-01 + 00:00:00</codeph> when converted to <codeph>BOOLEAN</codeph>, and <codeph>TRUE</codeph> becomes one second + past this epoch date, that is, <codeph>1970-01-01 00:00:01</codeph>. + </p> + + <p conref="../shared/impala_common.xml#common/null_null_arguments"/> + + <p conref="../shared/impala_common.xml#common/partitioning_blurb"/> + + <p> + Do not use a <codeph>BOOLEAN</codeph> column as a partition key. 
Although you can create such a table, + subsequent operations produce errors: + </p> + +<codeblock>[localhost:21000] > create table truth_table (assertion string) partitioned by (truth boolean); +[localhost:21000] > insert into truth_table values ('Pigs can fly',false); +ERROR: AnalysisException: INSERT into table with BOOLEAN partition column (truth) is not supported: partitioning.truth_table +</codeblock> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<codeblock>SELECT 1 < 2; +SELECT 2 = 5; +SELECT 100 < NULL, 100 > NULL; +CREATE TABLE assertions (claim STRING, really BOOLEAN); +INSERT INTO assertions VALUES + ("1 is less than 2", 1 < 2), + ("2 is the same as 5", 2 = 5), + ("Grass is green", true), + ("The moon is made of green cheese", false); +SELECT claim FROM assertions WHERE really = TRUE; +</codeblock> + + <p conref="../shared/impala_common.xml#common/hbase_ok"/> + + <p conref="../shared/impala_common.xml#common/parquet_ok"/> + + <p conref="../shared/impala_common.xml#common/text_bulky"/> + +<!-- <p conref="../shared/impala_common.xml#common/compatibility_blurb"/> --> + +<!-- <p conref="../shared/impala_common.xml#common/internals_blurb"/> --> + +<!-- <p conref="../shared/impala_common.xml#common/added_in_20"/> --> + + <p conref="../shared/impala_common.xml#common/column_stats_constant"/> + +<!-- <p conref="../shared/impala_common.xml#common/restrictions_blurb"/> --> + +<!-- <p conref="../shared/impala_common.xml#common/related_info"/> --> + + <p> + <b>Related information:</b> <xref href="impala_literals.xml#boolean_literals"/>, + <xref href="impala_operators.xml#operators"/>, + <xref href="impala_conditional_functions.xml#conditional_functions"/> + </p> + </conbody> +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_breakpad.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_breakpad.xml b/docs/topics/impala_breakpad.xml new file mode 100644 index 0000000..f764c2f --- /dev/null +++ b/docs/topics/impala_breakpad.xml @@ -0,0 +1,256 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="breakpad" rev="2.6.0 IMPALA-2686 CDH-40238"> + + <title>Breakpad Minidumps for Impala (<keyword keyref="impala26"/> or higher only)</title> + <titlealts audience="PDF"><navtitle>Breakpad Minidumps</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Support"/> + <data name="Category" value="Administrators"/> + </metadata> + </prolog> + + <conbody> + + <p rev="2.6.0 IMPALA-2686 CDH-40238"> + The <xref href="https://chromium.googlesource.com/breakpad/breakpad/" scope="external" format="html">breakpad</xref> + project is an open-source framework for crash reporting. + In <keyword keyref="impala26_full"/> and higher, Impala can use <codeph>breakpad</codeph> to record stack information and + register values when any of the Impala-related daemons crash due to an error such as <codeph>SIGSEGV</codeph> + or unhandled exceptions. + The dump files are much smaller than traditional core dump files. The dump mechanism itself uses very little + memory, which improves reliability if the crash occurs while the system is low on memory. 
+ </p>
+
+ <note type="important">
+ Because of the internal mechanisms involving Impala memory allocation and Linux
+ signalling for out-of-memory (OOM) errors, if an Impala-related daemon experiences a
+ crash due to an OOM condition, it does <i>not</i> generate a minidump for that error.
+ </note>
+
+ <p outputclass="toc inpage" audience="PDF"/>
+
+ </conbody>
+
+ <concept id="breakpad_minidump_enable">
+ <title>Enabling or Disabling Minidump Generation</title>
+ <conbody>
+ <p>
+ By default, a minidump file is generated when an Impala-related daemon crashes.
+ To turn off generation of the minidump files, change the
+ <uicontrol>minidump_path</uicontrol> configuration setting of one or more Impala-related daemons
+ to the empty string, and restart the corresponding services or daemons.
+ </p>
+
+ <p rev="IMPALA-3677 CDH-43745">
+ In <keyword keyref="impala27_full"/> and higher,
+ you can send a <codeph>SIGUSR1</codeph> signal to any Impala-related daemon to write a
+ Breakpad minidump. This lets you produce a minidump for advanced troubleshooting
+ without having to trigger a crash.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="breakpad_minidump_location" rev="IMPALA-3581">
+ <title>Specifying the Location for Minidump Files</title>
+ <conbody>
+ <p>
+ By default, all minidump files are written to the following location
+ on the host where a crash occurs:
+ <!-- Location stated in IMPALA-3581; overridden by different location from IMPALA-2686?
+ <filepath><varname>log_directory</varname>/minidumps/<varname>daemon_name</varname></filepath> -->
+ <ul>
+ <li>
+ <p>
+ Clusters managed by Cloudera Manager: <filepath>/var/log/impala-minidumps/<varname>daemon_name</varname></filepath>
+ </p>
+ </li>
+ <li>
+ <p>
+ Clusters not managed by Cloudera Manager:
+ <filepath><varname>impala_log_dir</varname>/<varname>daemon_name</varname>/minidumps/<varname>daemon_name</varname></filepath>
+ </p>
+ </li>
+ </ul>
+ The minidump files for <cmdname>impalad</cmdname>, <cmdname>catalogd</cmdname>,
+ and <cmdname>statestored</cmdname> are each written to a separate directory.
+ </p>
+ <p>
+ To specify a different location, set the
+ <!-- Again, IMPALA-3581 says one thing and IMPALA-2686 / observation of CM interface says another.
+ <codeph>log_dir</codeph> -->
+ <uicontrol>minidump_path</uicontrol>
+ configuration setting of one or more Impala-related daemons, and restart the corresponding services or daemons.
+ </p>
+ <p>
+ If you specify a relative path for this setting, the value is interpreted relative to
+ the default <uicontrol>minidump_path</uicontrol> directory.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="breakpad_minidump_number">
+ <title>Controlling the Number of Minidump Files</title>
+ <conbody>
+ <p>
+ As with any files used for logging or troubleshooting, consider limiting the number of
+ minidump files, or removing unneeded ones, depending on the amount of free storage
+ space on the hosts in the cluster.
+ </p>
+ <p>
+ Because the minidump files are only used for problem resolution, you can remove any such files that
+ are not needed to debug current issues.
+ </p>
+ <p>
+ To control how many minidump files Impala keeps around at any one time,
+ set the <uicontrol>max_minidumps</uicontrol> configuration setting
+ of one or more Impala-related daemons, and restart the corresponding services or daemons.
+ The default for this setting is 9. A zero or negative value is interpreted as
+ <q>unlimited</q>.
+ </p>
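+ <p>
+ On a cluster not managed by Cloudera Manager, these settings are typically supplied to each
+ daemon as startup options, for example in a flag file referenced through the
+ <codeph>--flagfile</codeph> option. The following sketch is illustrative only: the flag names
+ are assumed to match the <uicontrol>minidump_path</uicontrol> and
+ <uicontrol>max_minidumps</uicontrol> settings described in this topic, and the directory path
+ is a placeholder, so verify both against your own deployment:
+ </p>
+<codeblock># Hypothetical excerpt from an impalad flag file passed through --flagfile.
+# Write minidump files to a dedicated directory rather than the default location.
+--minidump_path=/data/impala/minidumps
+# Keep at most 5 minidump files per daemon; zero or a negative value means unlimited.
+--max_minidumps=5
+</codeblock>
+ <p>
+ As with any change to these settings, restart the corresponding service or daemon for the
+ new values to take effect.
+ </p>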
+ </conbody>
+ </concept>
+
+ <concept id="breakpad_minidump_logging">
+ <title>Detecting Crash Events</title>
+ <conbody>
+ <p>
+ You can see in the Impala log files, or in the Cloudera Manager charts for Impala,
+ when a crash event that generates a minidump file has occurred. Because each restart begins
+ a new log file, the <q>crashed</q> message is always at or near the bottom of the
+ log file. (There might be another later message if core dumps are also enabled.)
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="breakpad_support_process" rev="CDH-39818">
+ <title>Using the Minidump Files for Problem Resolution</title>
+ <conbody>
+ <p>
+ Typically, you provide minidump files to <keyword keyref="support_org"/> as part of problem resolution,
+ in the same way that you might provide a core dump. The <uicontrol>Send Diagnostic Data</uicontrol>
+ choice under the <uicontrol>Support</uicontrol> menu in Cloudera Manager guides you through the
+ process of selecting a time period and volume of diagnostic data, then collects the data
+ from all hosts and transmits the relevant information for you.
+ </p>
+ <fig id="fig_pqw_gvx_pr">
+ <title>Send Diagnostic Data choice under Support menu</title>
+ <image href="../images/support_send_diagnostic_data.png" scalefit="yes" placement="break"/>
+ </fig>
+ <p>
+ You might get additional instructions from <keyword keyref="support_org"/> about collecting minidumps to better isolate a specific problem.
+ Because the information in the minidump files is limited to stack traces and register contents,
+ the possibility of including sensitive information is much lower than with core dump files.
+ If any sensitive information is included in the minidump, <keyword keyref="support_org"/> preserves the confidentiality of that information.
+ </p>
+ </conbody>
+ </concept>
+
+ <concept id="breakpad_demo">
+ <title>Demonstration of Breakpad Feature</title>
+ <conbody>
+ <p>
+ The following example uses the command <cmdname>kill -11</cmdname> to
+ simulate a <codeph>SIGSEGV</codeph> crash for an <cmdname>impalad</cmdname>
+ process on a single DataNode, then examines the relevant log files and minidump file.
+ </p>
+ <p>
+ First, as root on a worker node, we kill the <cmdname>impalad</cmdname> process by sending it a
+ <codeph>SIGSEGV</codeph> signal. The original process ID was 23114. (Cloudera Manager
+ restarts the process with a new pid, as shown by the second <cmdname>ps</cmdname> command.)
+ </p>
+<codeblock><![CDATA[
+# ps ax | grep impalad
+23114 ? Sl 0:18 /opt/cloudera/parcels/<parcel_version>/lib/impala/sbin-retail/impalad --flagfile=/var/run/cloudera-scm-agent/process/114-impala-IMPALAD/impala-conf/impalad_flags
+31259 pts/0 S+ 0:00 grep impalad
+#
+# kill -11 23114
+#
+# ps ax | grep impalad
+31374 ? Rl 0:04 /opt/cloudera/parcels/<parcel_version>/lib/impala/sbin-retail/impalad --flagfile=/var/run/cloudera-scm-agent/process/114-impala-IMPALAD/impala-conf/impalad_flags
+31475 pts/0 S+ 0:00 grep impalad
+]]>
+</codeblock>
+
+ <p>
+ We locate the log directory underneath <filepath>/var/log</filepath>.
+ There is a <codeph>.INFO</codeph>, <codeph>.WARNING</codeph>, and <codeph>.ERROR</codeph>
+ log file for the 23114 process ID. The minidump message is written to the
+ <codeph>.INFO</codeph> file and the <codeph>.ERROR</codeph> file, but not the
+ <codeph>.WARNING</codeph> file. In this case, a large core file was also produced.
+ </p>
+<codeblock><![CDATA[
+# cd /var/log/impalad
+# ls -la | grep 23114
+-rw------- 1 impala impala 3539079168 Jun 23 15:20 core.23114
+-rw-r--r-- 1 impala impala 99057 Jun 23 15:20 hs_err_pid23114.log
+-rw-r--r-- 1 impala impala 351 Jun 23 15:20 impalad.worker_node_123.impala.log.ERROR.20160623-140343.23114
+-rw-r--r-- 1 impala impala 29101 Jun 23 15:20 impalad.worker_node_123.impala.log.INFO.20160623-140343.23114
+-rw-r--r-- 1 impala impala 228 Jun 23 14:03 impalad.worker_node_123.impala.log.WARNING.20160623-140343.23114
+]]>
+</codeblock>
+ <p>
+ The <codeph>.INFO</codeph> log includes the location of the minidump file, followed by
+ a report of a core dump. With the breakpad minidump feature enabled, we might now
+ disable core dumps or keep fewer of them around.
+ </p>
+<codeblock><![CDATA[
+# cat impalad.worker_node_123.impala.log.INFO.20160623-140343.23114
+...
+Wrote minidump to /var/log/impala-minidumps/impalad/0980da2d-a905-01e1-25ff883a-04ee027a.dmp
+#
+# A fatal error has been detected by the Java Runtime Environment:
+#
+# SIGSEGV (0xb) at pc=0x00000030c0e0b68a, pid=23114, tid=139869541455968
+#
+# JRE version: Java(TM) SE Runtime Environment (7.0_67-b01) (build 1.7.0_67-b01)
+# Java VM: Java HotSpot(TM) 64-Bit Server VM (24.65-b04 mixed mode linux-amd64 compressed oops)
+# Problematic frame:
+# C [libpthread.so.0+0xb68a] pthread_cond_wait+0xca
+#
+# Core dump written. Default location: /var/log/impalad/core or core.23114
+#
+# An error report file with more information is saved as:
+# /var/log/impalad/hs_err_pid23114.log
+#
+# If you would like to submit a bug report, please visit:
+# http://bugreport.sun.com/bugreport/crash.jsp
+# The crash happened outside the Java Virtual Machine in native code.
+# See problematic frame for where to report the bug.
+...
+
+# cat impalad.worker_node_123.impala.log.ERROR.20160623-140343.23114
+
+Log file created at: 2016/06/23 14:03:43
+Running on machine:.worker_node_123
+Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
+E0623 14:03:43.911002 23114 logging.cc:118] stderr will be logged to this file.
+Wrote minidump to /var/log/impala-minidumps/impalad/0980da2d-a905-01e1-25ff883a-04ee027a.dmp
+]]>
+</codeblock>
+ <p>
+ The resulting minidump file is much smaller than the corresponding core file,
+ making it much easier to supply diagnostic information to <keyword keyref="support_org"/>.
+ The transmission process for the minidump files is automated through Cloudera Manager.
+ </p> +<codeblock><![CDATA[ +# pwd +/var/log/impalad +# cd ../impala-minidumps/impalad +# ls +0980da2d-a905-01e1-25ff883a-04ee027a.dmp +# du -kh * +2.4M 0980da2d-a905-01e1-25ff883a-04ee027a.dmp +]]> +</codeblock> + </conbody> + </concept> + +</concept> http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_cdh.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_cdh.xml b/docs/topics/impala_cdh.xml new file mode 100644 index 0000000..7ecba74 --- /dev/null +++ b/docs/topics/impala_cdh.xml @@ -0,0 +1,25 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="impala_cdh"> + + <title>How Impala Works with CDH</title> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Concepts"/> + <data name="Category" value="CDH"/> + <data name="Category" value="Administrators"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p conref="../shared/impala_common.xml#common/impala_overview_diagram"/> + + <p conref="../shared/impala_common.xml#common/component_list"/> + + <p conref="../shared/impala_common.xml#common/query_overview"/> + </conbody> +</concept>