http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_alter_table.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_alter_table.xml b/docs/topics/impala_alter_table.xml new file mode 100644 index 0000000..61199cf --- /dev/null +++ b/docs/topics/impala_alter_table.xml @@ -0,0 +1,806 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="alter_table"> + + <title>ALTER TABLE Statement</title> + <titlealts audience="PDF"><navtitle>ALTER TABLE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Data Types"/> + <data name="Category" value="SQL"/> + <data name="Category" value="DDL"/> + <data name="Category" value="HDFS Caching"/> + <data name="Category" value="Tables"/> + <data name="Category" value="Schemas"/> + <data name="Category" value="S3"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">ALTER TABLE statement</indexterm> + The <codeph>ALTER TABLE</codeph> statement changes the structure or properties of an existing Impala table. + </p> + <p> + In Impala, this is primarily a logical operation that updates the table metadata in the metastore database that Impala + shares with Hive. Most <codeph>ALTER TABLE</codeph> operations do not actually rewrite, move, and so on the actual data + files. (The <codeph>RENAME TO</codeph> clause is the one exception; it can cause HDFS files to be moved to different paths.) + When you do an <codeph>ALTER TABLE</codeph> operation, you typically need to perform corresponding physical filesystem operations, + such as rewriting the data files to include extra fields, or converting them to a different file format. 
+ </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>ALTER TABLE [<varname>old_db_name</varname>.]<varname>old_table_name</varname> RENAME TO [<varname>new_db_name</varname>.]<varname>new_table_name</varname> + +ALTER TABLE <varname>name</varname> ADD COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...]) +ALTER TABLE <varname>name</varname> DROP [COLUMN] <varname>column_name</varname> +ALTER TABLE <varname>name</varname> CHANGE <varname>column_name</varname> <varname>new_name</varname> <varname>new_type</varname> +ALTER TABLE <varname>name</varname> REPLACE COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...]) + +ALTER TABLE <varname>name</varname> { ADD [IF NOT EXISTS] | DROP [IF EXISTS] } PARTITION (<varname>partition_spec</varname>) <ph rev="2.3.0">[PURGE]</ph> +<ph rev="2.3.0 IMPALA-1568 CDH-36799">ALTER TABLE <varname>name</varname> RECOVER PARTITIONS</ph> + +ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)] + SET { FILEFORMAT <varname>file_format</varname> + | LOCATION '<varname>hdfs_path_of_directory</varname>' + | TBLPROPERTIES (<varname>table_properties</varname>) + | SERDEPROPERTIES (<varname>serde_properties</varname>) } + +<ph rev="2.6.0 IMPALA-3369">ALTER TABLE <varname>name</varname> SET COLUMN STATS <varname>colname</varname> + ('<varname>statsKey</varname>'='<varname>val</varname>', ...)
+ +statsKey ::= numDVs | numNulls | avgSize | maxSize</ph> + +<ph rev="1.4.0">ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)] SET { CACHED IN '<varname>pool_name</varname>' <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED }</ph> + +<varname>new_name</varname> ::= [<varname>new_database</varname>.]<varname>new_table_name</varname> + +<varname>col_spec</varname> ::= <varname>col_name</varname> <varname>type_name</varname> + +<varname>partition_spec</varname> ::= <varname>partition_col</varname>=<varname>constant_value</varname> + +<varname>table_properties</varname> ::= '<varname>name</varname>'='<varname>value</varname>'[, '<varname>name</varname>'='<varname>value</varname>' ...] + +<varname>serde_properties</varname> ::= '<varname>name</varname>'='<varname>value</varname>'[, '<varname>name</varname>'='<varname>value</varname>' ...] + +<varname>file_format</varname> ::= { PARQUET | TEXTFILE | RCFILE | SEQUENCEFILE | AVRO } +</codeblock> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> + + <p rev="2.3.0"> + In <keyword keyref="impala23_full"/> and higher, the <codeph>ALTER TABLE</codeph> statement can + change the metadata for tables containing complex types (<codeph>ARRAY</codeph>, + <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>). + For example, you can use an <codeph>ADD COLUMNS</codeph>, <codeph>DROP COLUMN</codeph>, or <codeph>CHANGE</codeph> + clause to modify the table layout for complex type columns. + Although Impala queries only work for complex type columns in Parquet tables, the complex type support in the + <codeph>ALTER TABLE</codeph> statement applies to all file formats. + For example, you can use Impala to update metadata for a staging table in a non-Parquet file format where the + data is populated by Hive. 
Or you can use <codeph>ALTER TABLE SET FILEFORMAT</codeph> to change the format + of an existing table to Parquet so that Impala can query it. Remember that changing the file format for a table does + not convert the data files within the table; you must prepare any Parquet data files containing complex types + outside Impala, and bring them into the table using <codeph>LOAD DATA</codeph> or updating the table's + <codeph>LOCATION</codeph> property. + See <xref href="impala_complex_types.xml#complex_types"/> for details about using complex types. + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + Whenever you specify partitions in an <codeph>ALTER TABLE</codeph> statement, through the <codeph>PARTITION + (<varname>partition_spec</varname>)</codeph> clause, you must include all the partitioning columns in the + specification. + </p> + + <p> + Most of the <codeph>ALTER TABLE</codeph> operations work the same for internal tables (managed by Impala) as + for external tables (with data files located in arbitrary locations). The exception is renaming a table; for + an external table, the underlying data directory is not renamed or moved. + </p> + + <p conref="../shared/impala_common.xml#common/s3_blurb"/> + + <p rev="2.6.0 CDH-39913 IMPALA-1878"> + You can specify an <codeph>s3a://</codeph> prefix on the <codeph>LOCATION</codeph> attribute of a table or partition + to make Impala query data from the Amazon S3 filesystem. In <keyword keyref="impala26_full"/> and higher, Impala automatically + handles creating or removing the associated folders when you issue <codeph>ALTER TABLE</codeph> statements + with the <codeph>ADD PARTITION</codeph> or <codeph>DROP PARTITION</codeph> clauses. 
+ </p> + + <p conref="../shared/impala_common.xml#common/s3_ddl"/> + + <p rev="1.4.0"> + <b>HDFS caching (CACHED IN clause):</b> + </p> + + <p rev="1.4.0"> + If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in the table + directory or the partition subdirectories are designated to be loaded into memory with the HDFS caching + mechanism. See <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using the HDFS + caching feature. + </p> + + <p conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/> + + <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/> + + <p> + The following sections show examples of the use cases for various <codeph>ALTER TABLE</codeph> clauses. + </p> + + <p> + <b>To rename a table (RENAME TO clause):</b> + </p> + +<!-- Beefing up the syntax in its original location up to, don't need to repeat it here. +<codeblock>ALTER TABLE <varname>old_name</varname> RENAME TO <varname>new_name</varname>;</codeblock> +--> + + <p> + The <codeph>RENAME TO</codeph> clause lets you change the name of an existing table, and optionally which + database it is located in. + </p> + + <p> + For internal tables, this operation physically renames the directory within HDFS that contains the data files; + the original directory name no longer exists. By qualifying the table names with database names, you can use + this technique to move an internal table (and its associated data directory) from one database to another. + For example: + </p> + +<codeblock>create database d1; +create database d2; +create database d3; +use d1; +create table mobile (x int); +use d2; +-- Move table from another database to the current one. +alter table d1.mobile rename to mobile; +use d1; +-- Move table from one database to another. 
+alter table d2.mobile rename to d3.mobile;</codeblock> + + <p> + For external tables, the <codeph>RENAME TO</codeph> clause changes the table name in the metastore database, + but the table's underlying data directory is not renamed or moved. + </p> + + <p> + <b>To change the physical location where Impala looks for data files associated with a table or + partition:</b> + </p> + +<codeblock>ALTER TABLE <varname>table_name</varname> [PARTITION (<varname>partition_spec</varname>)] SET LOCATION '<varname>hdfs_path_of_directory</varname>';</codeblock> + + <p> + The path you specify is the full HDFS path where the data files reside, or will be created. Impala does not + create any additional subdirectory named after the table. Impala does not move any data files to this new + location or change any data files that might already exist in that directory. + </p> + + <p> + To set the location for a single partition, include the <codeph>PARTITION</codeph> clause. Specify all the + same partitioning columns for the table, with a constant value for each, to precisely identify the single + partition affected by the statement: + </p> + +<codeblock>create table p1 (s string) partitioned by (month int, day int); +-- Each ADD PARTITION clause creates a subdirectory in HDFS. +alter table p1 add partition (month=1, day=1); +alter table p1 add partition (month=1, day=2); +alter table p1 add partition (month=2, day=1); +alter table p1 add partition (month=2, day=2); +-- Redirect queries, INSERT, and LOAD DATA for one partition +-- to a specific different directory.
+alter table p1 partition (month=1, day=1) set location '/usr/external_data/new_years_day'; +</codeblock> + + <note conref="../shared/impala_common.xml#common/add_partition_set_location"/> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + <b>To automatically detect new partition directories added through Hive or HDFS operations:</b> + </p> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + In <keyword keyref="impala23_full"/> and higher, the <codeph>RECOVER PARTITIONS</codeph> clause scans + a partitioned table to detect if any new partition directories were added outside of Impala, + such as by Hive <codeph>ALTER TABLE</codeph> statements or by <cmdname>hdfs dfs</cmdname> + or <cmdname>hadoop fs</cmdname> commands. The <codeph>RECOVER PARTITIONS</codeph> clause + automatically recognizes any data files present in these new directories, the same as + the <codeph>REFRESH</codeph> statement does. + </p> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + For example, here is a sequence of examples showing how you might create a partitioned table in Impala, + create new partitions through Hive, copy data files into the new partitions with the <cmdname>hdfs</cmdname> + command, and have Impala recognize the new partitions and new data: + </p> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + In Impala, create the table, and a single partition for demonstration purposes: + </p> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +create database recover_partitions; +use recover_partitions; +create table t1 (s string) partitioned by (yy int, mm int); +insert into t1 partition (yy = 2016, mm = 1) values ('Partition exists'); +show files in t1; ++---------------------------------------------------------------------+------+--------------+ +| Path | Size | Partition | ++---------------------------------------------------------------------+------+--------------+ +| /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt | 17B | yy=2016/mm=1 | 
++---------------------------------------------------------------------+------+--------------+ +quit; +]]> +</codeblock> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + In Hive, create some new partitions. In a real use case, you might create the + partitions and populate them with data as the final stages of an ETL pipeline. + </p> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +hive> use recover_partitions; +OK +hive> alter table t1 add partition (yy = 2016, mm = 2); +OK +hive> alter table t1 add partition (yy = 2016, mm = 3); +OK +hive> quit; +]]> +</codeblock> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + For demonstration purposes, manually copy data (a single row) into these + new partitions, using manual HDFS operations: + </p> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +$ hdfs dfs -ls /user/hive/warehouse/recover_partitions.db/t1/yy=2016/ +Found 3 items +drwxr-xr-x - impala hive 0 2016-05-09 16:06 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1 +drwxr-xr-x - jrussell hive 0 2016-05-09 16:14 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=2 +drwxr-xr-x - jrussell hive 0 2016-05-09 16:13 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=3 + +$ hdfs dfs -cp /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt \ + /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=2/data.txt +$ hdfs dfs -cp /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt \ + /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=3/data.txt +]]> +</codeblock> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +hive> select * from t1; +OK +Partition exists 2016 1 +Partition exists 2016 2 +Partition exists 2016 3 +hive> quit; +]]> +</codeblock> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + In Impala, initially the partitions and data are not visible. 
+ Running <codeph>ALTER TABLE</codeph> with the <codeph>RECOVER PARTITIONS</codeph> + clause scans the table data directory to find any new partition directories, and + the data files inside them: + </p> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +select * from t1; ++------------------+------+----+ +| s | yy | mm | ++------------------+------+----+ +| Partition exists | 2016 | 1 | ++------------------+------+----+ + +alter table t1 recover partitions; +select * from t1; ++------------------+------+----+ +| s | yy | mm | ++------------------+------+----+ +| Partition exists | 2016 | 1 | +| Partition exists | 2016 | 3 | +| Partition exists | 2016 | 2 | ++------------------+------+----+ +]]> +</codeblock> + + <p rev="1.2"> + <b>To change the key-value pairs of the TBLPROPERTIES and SERDEPROPERTIES fields:</b> + </p> + +<codeblock>ALTER TABLE <varname>table_name</varname> SET TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>'[, ...]); +ALTER TABLE <varname>table_name</varname> SET SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>'[, ...]);</codeblock> + + <p> + The <codeph>TBLPROPERTIES</codeph> clause is primarily a way to associate arbitrary user-specified data items + with a particular table. + </p> + + <p> + The <codeph>SERDEPROPERTIES</codeph> clause sets up metadata defining how tables are read or written, needed + in some cases by Hive but not used extensively by Impala. You would use this clause primarily to change the + delimiter in an existing text table or partition, by setting the <codeph>'serialization.format'</codeph> and + <codeph>'field.delim'</codeph> property values to the new delimiter character: + </p> + +<codeblock>-- This table begins life as pipe-separated text format. 
+create table change_to_csv (s1 string, s2 string) row format delimited fields terminated by '|'; +-- Then we change it to a CSV table. +alter table change_to_csv set SERDEPROPERTIES ('serialization.format'=',', 'field.delim'=','); +insert overwrite change_to_csv values ('stop','go'), ('yes','no'); +!hdfs dfs -cat 'hdfs://<varname>hostname</varname>:8020/<varname>data_directory</varname>/<varname>dbname</varname>.db/change_to_csv/<varname>data_file</varname>'; +stop,go +yes,no</codeblock> + + <p> + Use the <codeph>DESCRIBE FORMATTED</codeph> statement to see the current values of these properties for an + existing table. See <xref href="impala_create_table.xml#create_table"/> for more details about these clauses. + See <xref href="impala_perf_stats.xml#perf_table_stats_manual"/> for an example of using table properties to + fine-tune the performance-related table statistics. + </p> + + <p> + <b>To manually set or update table or column statistics:</b> + </p> + + <p> + Although for most tables the <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph> + statement is all you need to keep table and column statistics up to date for a table, + sometimes for a very large table or one that is updated frequently, the length of time to recompute + all the statistics might make it impractical to run those statements as often as needed. + As a workaround, you can use the <codeph>ALTER TABLE</codeph> statement to set table statistics + at the level of the entire table or a single partition, or column statistics at the level of + the entire table. + </p> + + <p> + You can set the <codeph>numrows</codeph> value for table statistics by changing the + <codeph>TBLPROPERTIES</codeph> setting for a table or partition. 
+ For example: +<codeblock conref="../shared/impala_common.xml#common/set_numrows_example"/> +<codeblock conref="../shared/impala_common.xml#common/set_numrows_partitioned_example"/> + See <xref href="impala_perf_stats.xml#perf_table_stats_manual"/> for details. + </p> + + <p rev="2.6.0 IMPALA-3369"> + In <keyword keyref="impala26_full"/> and higher, you can use the <codeph>SET COLUMN STATS</codeph> clause + to set a specific stats value for a particular column. + </p> + + <p conref="../shared/impala_common.xml#common/set_column_stats_example"/> + + <p> + <b>To reorganize columns for a table:</b> + </p> + +<codeblock>ALTER TABLE <varname>table_name</varname> ADD COLUMNS (<varname>column_defs</varname>); +ALTER TABLE <varname>table_name</varname> REPLACE COLUMNS (<varname>column_defs</varname>); +ALTER TABLE <varname>table_name</varname> CHANGE <varname>column_name</varname> <varname>new_name</varname> <varname>new_type</varname>; +ALTER TABLE <varname>table_name</varname> DROP <varname>column_name</varname>;</codeblock> + + <p> + The <varname>column_spec</varname> is the same as in the <codeph>CREATE TABLE</codeph> statement: the column + name, then its data type, then an optional comment. You can add multiple columns at a time. The parentheses + are required whether you add a single column or multiple columns. When you replace columns, all the original + column definitions are discarded. You might use this technique if you receive a new set of data files with + different data types or columns in a different order. (The data files are retained, so if the new columns are + incompatible with the old ones, use <codeph>INSERT OVERWRITE</codeph> or <codeph>LOAD DATA OVERWRITE</codeph> + to replace all the data before issuing any further queries.) + </p> + + <p rev="CDH-37178"> + For example, here is how you might add columns to an existing table. 
+ The first <codeph>ALTER TABLE</codeph> adds two new columns, and the second + <codeph>ALTER TABLE</codeph> adds one new column. + A single Impala query reads both the old and new data files, containing different numbers of columns. + For any columns not present in a particular data file, all the column values are + considered to be <codeph>NULL</codeph>. + </p> + +<codeblock rev="CDH-37178"> +create table t1 (x int); +insert into t1 values (1), (2); + +alter table t1 add columns (s string, t timestamp); +insert into t1 values (3, 'three', now()); + +alter table t1 add columns (b boolean); +insert into t1 values (4, 'four', now(), true); + +select * from t1 order by x; ++---+-------+-------------------------------+------+ +| x | s | t | b | ++---+-------+-------------------------------+------+ +| 1 | NULL | NULL | NULL | +| 2 | NULL | NULL | NULL | +| 3 | three | 2016-05-11 11:19:45.054457000 | NULL | +| 4 | four | 2016-05-11 11:20:20.260733000 | true | ++---+-------+-------------------------------+------+ +</codeblock> + + <p> + You might use the <codeph>CHANGE</codeph> clause to rename a single column, or to treat an existing column as + a different type than before, such as to switch between treating a column as <codeph>STRING</codeph> and + <codeph>TIMESTAMP</codeph>, or between <codeph>INT</codeph> and <codeph>BIGINT</codeph>. You can only drop a + single column at a time; to drop multiple columns, issue multiple <codeph>ALTER TABLE</codeph> statements, or + define the new set of columns with a single <codeph>ALTER TABLE ... REPLACE COLUMNS</codeph> statement. + </p> + + <p rev="CDH-37178"> + The following examples show some safe operations to drop or change columns. Dropping the final column + in a table lets Impala ignore the data causing any disruption to existing data files. Changing the type + of a column works if existing data values can be safely converted to the new type. The type conversion + rules depend on the file format of the underlying table. 
For example, in a text table, the same value + can be interpreted as a <codeph>STRING</codeph> or a numeric value, while in a binary format such as + Parquet, the rules are stricter and type conversions only work between certain sizes of integers. + </p> + +<codeblock rev="CDH-37178"> +create table optional_columns (x int, y int, z int, a1 int, a2 int); +insert into optional_columns values (1,2,3,0,0), (2,3,4,100,100); + +-- When the last column in the table is dropped, Impala ignores the +-- values that are no longer needed. (Dropping A1 but leaving A2 +-- would cause problems, as we will see in a subsequent example.) +alter table optional_columns drop column a2; +alter table optional_columns drop column a1; + +select * from optional_columns; ++---+---+---+ +| x | y | z | ++---+---+---+ +| 1 | 2 | 3 | +| 2 | 3 | 4 | ++---+---+---+ +</codeblock> + +<codeblock rev="CDH-37178"> +create table int_to_string (s string, x int); +insert into int_to_string values ('one', 1), ('two', 2); + +-- What was an INT column will now be interpreted as STRING. +-- This technique works for text tables but not other file formats. +-- The second X represents the new name of the column, which we keep the same. +alter table int_to_string change x x string; + +-- Once the type is changed, we can insert non-integer values into the X column +-- and treat that column as a string, for example by uppercasing or concatenating. +insert into int_to_string values ('three', 'trois'); +select s, upper(x) from int_to_string; ++-------+----------+ +| s | upper(x) | ++-------+----------+ +| one | 1 | +| two | 2 | +| three | TROIS | ++-------+----------+ +</codeblock> + + <p rev="CDH-37178"> + Remember that Impala does not actually do any conversion for the underlying data files as a result of + <codeph>ALTER TABLE</codeph> statements. 
If you use <codeph>ALTER TABLE</codeph> to create a table + layout that does not agree with the contents of the underlying files, you must replace the files + yourself, such as using <codeph>LOAD DATA</codeph> to load a new set of data files, or + <codeph>INSERT OVERWRITE</codeph> to copy from another table and replace the original data. + </p> + + <p rev="CDH-37178"> + The following example shows what happens if you delete the middle column from a Parquet table containing three columns. + The underlying data files still contain three columns of data. Because the columns are interpreted based on their positions in + the data file instead of the specific column names, a <codeph>SELECT *</codeph> query now reads the first and second + columns from the data file, potentially leading to unexpected results or conversion errors. + For this reason, if you expect to someday drop a column, declare it as the last column in the table, where its data + can be ignored by queries after the column is dropped. Or, re-run your ETL process and create new data files + if you drop or change the type of a column in a way that causes problems with existing data files. + </p> + +<codeblock rev="CDH-37178"> +-- Parquet table showing how dropping a column can produce unexpected results. +create table p1 (s1 string, s2 string, s3 string) stored as parquet; + +insert into p1 values ('one', 'un', 'uno'), ('two', 'deux', 'dos'), + ('three', 'trois', 'tres'); +select * from p1; ++-------+-------+------+ +| s1 | s2 | s3 | ++-------+-------+------+ +| one | un | uno | +| two | deux | dos | +| three | trois | tres | ++-------+-------+------+ + +alter table p1 drop column s2; +-- The S3 column contains unexpected results. +-- Because S2 and S3 have compatible types, the query reads +-- values from the dropped S2, because the existing data files +-- still contain those values as the second column. 
+select * from p1; ++-------+-------+ +| s1 | s3 | ++-------+-------+ +| one | un | +| two | deux | +| three | trois | ++-------+-------+ +</codeblock> + +<codeblock rev="CDH-37178"> +-- Parquet table showing how dropping a column can produce conversion errors. +create table p2 (s1 string, x int, s3 string) stored as parquet; + +insert into p2 values ('one', 1, 'uno'), ('two', 2, 'dos'), ('three', 3, 'tres'); +select * from p2; ++-------+---+------+ +| s1 | x | s3 | ++-------+---+------+ +| one | 1 | uno | +| two | 2 | dos | +| three | 3 | tres | ++-------+---+------+ + +alter table p2 drop column x; +select * from p2; +WARNINGS: +File '<varname>hdfs_filename</varname>' has an incompatible Parquet schema for column 'add_columns.p2.s3'. +Column type: STRING, Parquet schema: +optional int32 x [i:1 d:1 r:0] + +File '<varname>hdfs_filename</varname>' has an incompatible Parquet schema for column 'add_columns.p2.s3'. +Column type: STRING, Parquet schema: +optional int32 x [i:1 d:1 r:0] +</codeblock> + + <p rev="IMPALA-3092"> + In <keyword keyref="impala26_full"/> and higher, if an Avro table is created without column definitions in the + <codeph>CREATE TABLE</codeph> statement, and columns are later + added through <codeph>ALTER TABLE</codeph>, the resulting + table is now queryable. Missing values from the newly added + columns now default to <codeph>NULL</codeph>. + </p> + + <p> + <b>To change the file format that Impala expects data to be in, for a table or partition:</b> + </p> + + <p> + Use an <codeph>ALTER TABLE ... SET FILEFORMAT</codeph> clause. You can include an optional <codeph>PARTITION + (<varname>col1</varname>=<varname>val1</varname>, <varname>col2</varname>=<varname>val2</varname>, + ...</codeph> clause so that the file format is changed for a specific partition rather than the entire table. 
+ </p> + + <p> + Because this operation only changes the table metadata, you must do any conversion of existing data using + regular Hadoop techniques outside of Impala. Any new data created by the Impala <codeph>INSERT</codeph> + statement will be in the new format. You cannot specify the delimiter for Text files; the data files must be + comma-delimited. +<!-- Although Impala can read Avro tables + created through Hive, you cannot specify the Avro file format in an Impala + <codeph>ALTER TABLE</codeph> statement. --> + </p> + + <p> + To set the file format for a single partition, include the <codeph>PARTITION</codeph> clause. Specify all the + same partitioning columns for the table, with a constant value for each, to precisely identify the single + partition affected by the statement: + </p> + +<codeblock>create table p1 (s string) partitioned by (month int, day int); +-- Each ADD PARTITION clause creates a subdirectory in HDFS. +alter table p1 add partition (month=1, day=1); +alter table p1 add partition (month=1, day=2); +alter table p1 add partition (month=2, day=1); +alter table p1 add partition (month=2, day=2); +-- Queries and INSERT statements will read and write files +-- in this format for this specific partition. +alter table p1 partition (month=2, day=2) set fileformat parquet; +</codeblock> + + <p> + <b>To add or drop partitions for a table</b>, the table must already be partitioned (that is, created with a + <codeph>PARTITIONED BY</codeph> clause). The partition is a physical directory in HDFS, with a name that + encodes a particular column value (the <b>partition key</b>). The Impala <codeph>INSERT</codeph> statement + already creates the partition if necessary, so the <codeph>ALTER TABLE ... ADD PARTITION</codeph> is + primarily useful for importing data by moving or copying existing data files into the HDFS directory + corresponding to a partition. 
+ (You can use the <codeph>LOAD DATA</codeph> statement to move files into the + partition directory, or <codeph>ALTER TABLE ... PARTITION (...) SET LOCATION</codeph> to point a partition at + a directory that already contains data files.) + </p> + + <p> + The <codeph>DROP PARTITION</codeph> clause is used to remove the HDFS directory and associated data files for + a particular set of partition key values; for example, if you always analyze the last 3 months worth of data, + at the beginning of each month you might drop the oldest partition that is no longer needed. Removing + partitions reduces the amount of metadata associated with the table and the complexity of calculating the + optimal query plan, which can simplify and speed up queries on partitioned tables, particularly join queries. + Here is an example showing the <codeph>ADD PARTITION</codeph> and <codeph>DROP PARTITION</codeph> clauses. + </p> + + <p> + To avoid errors while adding or dropping partitions whose existence is not certain, + add the optional <codeph>IF [NOT] EXISTS</codeph> clause between the <codeph>ADD</codeph> or + <codeph>DROP</codeph> keyword and the <codeph>PARTITION</codeph> keyword. That is, the entire + clause becomes <codeph>ADD IF NOT EXISTS PARTITION</codeph> or <codeph>DROP IF EXISTS PARTITION</codeph>. + The following example shows how partitions can be created automatically through <codeph>INSERT</codeph> + statements, or manually through <codeph>ALTER TABLE</codeph> statements. The <codeph>IF [NOT] EXISTS</codeph> + clauses let the <codeph>ALTER TABLE</codeph> statements succeed even if a new requested partition already + exists, or a partition to be dropped does not exist.
+ </p> + +<p> +Inserting 2 year values creates 2 partitions: +</p> + +<codeblock> +create table partition_t (s string) partitioned by (y int); +insert into partition_t (s,y) values ('two thousand',2000), ('nineteen ninety',1990); +show partitions partition_t; ++-------+-------+--------+------+--------------+-------------------+--------+-------------------+ +| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +| 1990 | -1 | 1 | 16B | NOT CACHED | NOT CACHED | TEXT | false | +| 2000 | -1 | 1 | 13B | NOT CACHED | NOT CACHED | TEXT | false | +| Total | -1 | 2 | 29B | 0B | | | | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +</codeblock> + +<p> +Without the <codeph>IF NOT EXISTS</codeph> clause, an attempt to add a new partition might fail: +</p> + +<codeblock> +alter table partition_t add partition (y=2000); +ERROR: AnalysisException: Partition spec already exists: (y=2000). 
+</codeblock> + +<p> +The <codeph>IF NOT EXISTS</codeph> clause makes the statement succeed whether or not there was already a +partition with the specified key value: +</p> + +<codeblock> +alter table partition_t add if not exists partition (y=2000); +alter table partition_t add if not exists partition (y=2010); +show partitions partition_t; ++-------+-------+--------+------+--------------+-------------------+--------+-------------------+ +| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +| 1990 | -1 | 1 | 16B | NOT CACHED | NOT CACHED | TEXT | false | +| 2000 | -1 | 1 | 13B | NOT CACHED | NOT CACHED | TEXT | false | +| 2010 | -1 | 0 | 0B | NOT CACHED | NOT CACHED | TEXT | false | +| Total | -1 | 2 | 29B | 0B | | | | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +</codeblock> + +<p> +Likewise, the <codeph>IF EXISTS</codeph> clause lets <codeph>DROP PARTITION</codeph> succeed whether or not the partition is already +in the table: +</p> + +<codeblock> +alter table partition_t drop if exists partition (y=2000); +alter table partition_t drop if exists partition (y=1950); +show partitions partition_t; ++-------+-------+--------+------+--------------+-------------------+--------+-------------------+ +| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +| 1990 | -1 | 1 | 16B | NOT CACHED | NOT CACHED | TEXT | false | +| 2010 | -1 | 0 | 0B | NOT CACHED | NOT CACHED | TEXT | false | +| Total | -1 | 1 | 16B | 0B | | | | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +</codeblock> + + <p rev="2.3.0"> The optional <codeph>PURGE</codeph> keyword, available in + <keyword keyref="impala23_full"/> and higher, is used with the <codeph>DROP + 
PARTITION</codeph> clause to remove associated HDFS data files
+    immediately rather than going through the HDFS trashcan mechanism. Use
+    this keyword when dropping a partition if it is crucial to remove the data
+    as quickly as possible to free up space, or if there is a problem with the
+    trashcan, such as the trashcan not being configured or being in a
+    different HDFS encryption zone than the data files. </p>
+
+  <!--
+  To do: Make example more general by partitioning by year/month/day.
+  Then could show inserting into fixed year, variable month and day;
+  dropping particular year/month/day partition.
+  -->
+
+<codeblock>-- Create an empty table and define the partitioning scheme.
+create table part_t (x int) partitioned by (month int);
+-- Create an empty partition into which you could copy data files from some other source.
+alter table part_t add partition (month=1);
+-- After changing the underlying data, issue a REFRESH statement to make the data visible in Impala.
+refresh part_t;
+-- Later, do the same for the next month.
+alter table part_t add partition (month=2);
+
+-- Now you no longer need the older data.
+alter table part_t drop partition (month=1);
+-- If the table was partitioned by month and year, you would issue a statement like:
+-- alter table part_t drop partition (year=2003,month=1);
+-- which would require 12 ALTER TABLE statements to remove a year's worth of data.
+
+-- If the data files for subsequent months were in a different file format,
+-- you could set a different file format for the new partition as you create it.
+alter table part_t add partition (month=3) set fileformat=parquet;
+</codeblock>
+
+  <p>
+    The value specified for a partition key can be an arbitrary constant expression, without any references to
+    columns.
For example:
+  </p>
+
+<codeblock>alter table time_data add partition (month=concat('Decem','ber'));
+alter table sales_data add partition (zipcode = cast(9021 * 10 as string));</codeblock>
+
+  <note>
+    <p>
+      An alternative way to reorganize a table and its associated data files is to use <codeph>CREATE
+      TABLE</codeph> to create a variation of the original table, then use <codeph>INSERT</codeph> to copy the
+      transformed or reordered data to the new table. The advantage of <codeph>ALTER TABLE</codeph> is that it
+      avoids making a duplicate copy of the data files, allowing you to reorganize huge volumes of data in a
+      space-efficient way using familiar Hadoop techniques.
+    </p>
+  </note>
+
+  <p>
+    <b>To switch a table between internal and external:</b>
+  </p>
+
+  <p conref="../shared/impala_common.xml#common/switch_internal_external_table"/>
+
+  <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+  <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+  <p rev="CDH-19187">
+    Most <codeph>ALTER TABLE</codeph> clauses do not actually
+    read or write any HDFS files, and so do not depend on
+    specific HDFS permissions. For example, the <codeph>SET FILEFORMAT</codeph>
+    clause does not actually check the file format of existing data files or
+    convert them to the new format, and the <codeph>SET LOCATION</codeph> clause
+    does not require any special permissions on the new location.
+    (Any permission-related failures would come later, when you
+    actually query or insert into the table.)
+  </p>
+<!-- Haven't rigorously tested all the assertions in the following paragraph. -->
+<!-- Most testing so far has been around RENAME TO clause. -->
+  <p>
+    In general, <codeph>ALTER TABLE</codeph> clauses that do touch
+    HDFS files and directories require the same HDFS permissions
+    as corresponding <codeph>CREATE</codeph>, <codeph>INSERT</codeph>,
+    or <codeph>SELECT</codeph> statements.
+ The permissions allow + the user ID that the <cmdname>impalad</cmdname> daemon runs under, + typically the <codeph>impala</codeph> user, to read or write + files or directories, or (in the case of the execute bit) descend into a directory. + The <codeph>RENAME TO</codeph> clause requires read, write, and execute permission in the + source and destination database directories and in the table data directory, + and read and write permission for the data files within the table. + The <codeph>ADD PARTITION</codeph> and <codeph>DROP PARTITION</codeph> clauses + require write and execute permissions for the associated partition directory. + </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_tables.xml#tables"/>, + <xref href="impala_create_table.xml#create_table"/>, <xref href="impala_drop_table.xml#drop_table"/>, + <xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>, + <xref href="impala_tables.xml#external_tables"/> + </p> + </conbody> +</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_alter_view.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_alter_view.xml b/docs/topics/impala_alter_view.xml new file mode 100644 index 0000000..baf539d --- /dev/null +++ b/docs/topics/impala_alter_view.xml @@ -0,0 +1,86 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="1.1" id="alter_view"> + + <title>ALTER VIEW Statement</title> + <titlealts audience="PDF"><navtitle>ALTER VIEW</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="DDL"/> + <data name="Category" value="Tables"/> + <data name="Category" value="Schemas"/> + <data name="Category" value="Views"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">ALTER VIEW statement</indexterm> + Changes the characteristics of a view. The syntax has two forms: + </p> + + <ul> + <li> + The <codeph>AS</codeph> clause associates the view with a different query. + </li> + <li> + The <codeph>RENAME TO</codeph> clause changes the name of the view, moves the view to + a different database, or both. + </li> + </ul> + + <p> + Because a view is purely a logical construct (an alias for a query) with no physical data behind it, + <codeph>ALTER VIEW</codeph> only involves changes to metadata in the metastore database, not any data files + in HDFS. + </p> + +<!-- View _permissions_ don't rely on underlying table. --> + +<!-- Could use views to grant access only to certain columns. --> + +<!-- Treated like a table for authorization. --> + +<!-- ALTER VIEW that queries another view - possibly a runtime error. 
--> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> AS <varname>select_statement</varname> +ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> RENAME TO [<varname>database_name</varname>.]<varname>view_name</varname></codeblock> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/security_blurb"/> + <p conref="../shared/impala_common.xml#common/redaction_yes"/> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<codeblock>create table t1 (x int, y int, s string); +create table t2 like t1; +create view v1 as select * from t1; +alter view v1 as select * from t2; +alter view v1 as select x, upper(s) s from t2;</codeblock> + +<!-- Repeat the same blurb + example to see the definition of a view, as in CREATE VIEW. --> + + <p conref="../shared/impala_common.xml#common/describe_formatted_view"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_views.xml#views"/>, <xref href="impala_create_view.xml#create_view"/>, + <xref href="impala_drop_view.xml#drop_view"/> + </p> + </conbody> +</concept>
