http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_alter_table.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_alter_table.xml b/docs/topics/impala_alter_table.xml new file mode 100644 index 0000000..61199cf --- /dev/null +++ b/docs/topics/impala_alter_table.xml @@ -0,0 +1,806 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="alter_table"> + + <title>ALTER TABLE Statement</title> + <titlealts audience="PDF"><navtitle>ALTER TABLE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Data Types"/> + <data name="Category" value="SQL"/> + <data name="Category" value="DDL"/> + <data name="Category" value="HDFS Caching"/> + <data name="Category" value="Tables"/> + <data name="Category" value="Schemas"/> + <data name="Category" value="S3"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">ALTER TABLE statement</indexterm> + The <codeph>ALTER TABLE</codeph> statement changes the structure or properties of an existing Impala table. + </p> + <p> + In Impala, this is primarily a logical operation that updates the table metadata in the metastore database that Impala + shares with Hive. Most <codeph>ALTER TABLE</codeph> operations do not actually rewrite, move, and so on the actual data + files. (The <codeph>RENAME TO</codeph> clause is the one exception; it can cause HDFS files to be moved to different paths.) + When you do an <codeph>ALTER TABLE</codeph> operation, you typically need to perform corresponding physical filesystem operations, + such as rewriting the data files to include extra fields, or converting them to a different file format. 
+ </p> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>ALTER TABLE [<varname>old_db_name</varname>.]<varname>old_table_name</varname> RENAME TO [<varname>new_db_name</varname>.]<varname>new_table_name</varname> + +ALTER TABLE <varname>name</varname> ADD COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...]) +ALTER TABLE <varname>name</varname> DROP [COLUMN] <varname>column_name</varname> +ALTER TABLE <varname>name</varname> CHANGE <varname>column_name</varname> <varname>new_name</varname> <varname>new_type</varname> +ALTER TABLE <varname>name</varname> REPLACE COLUMNS (<varname>col_spec</varname>[, <varname>col_spec</varname> ...]) + +ALTER TABLE <varname>name</varname> { ADD [IF NOT EXISTS] | DROP [IF EXISTS] } PARTITION (<varname>partition_spec</varname>) <ph rev="2.3.0">[PURGE]</ph> +<ph rev="2.3.0 IMPALA-1568 CDH-36799">ALTER TABLE <varname>name</varname> RECOVER PARTITIONS</ph> + +ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)] + SET { FILEFORMAT <varname>file_format</varname> + | LOCATION '<varname>hdfs_path_of_directory</varname>' + | TBLPROPERTIES (<varname>table_properties</varname>) + | SERDEPROPERTIES (<varname>serde_properties</varname>) } + +<ph rev="2.6.0 IMPALA-3369">ALTER TABLE <varname>name</varname> SET COLUMN STATS <varname>colname</varname> + ('<varname>statsKey</varname>'='<varname>val</varname>', ...)
+ +statsKey ::= numDVs | numNulls | avgSize | maxSize</ph> + +<ph rev="1.4.0">ALTER TABLE <varname>name</varname> [PARTITION (<varname>partition_spec</varname>)] SET { CACHED IN '<varname>pool_name</varname>' <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED }</ph> + +<varname>new_name</varname> ::= [<varname>new_database</varname>.]<varname>new_table_name</varname> + +<varname>col_spec</varname> ::= <varname>col_name</varname> <varname>type_name</varname> + +<varname>partition_spec</varname> ::= <varname>partition_col</varname>=<varname>constant_value</varname> + +<varname>table_properties</varname> ::= '<varname>name</varname>'='<varname>value</varname>'[, '<varname>name</varname>'='<varname>value</varname>' ...] + +<varname>serde_properties</varname> ::= '<varname>name</varname>'='<varname>value</varname>'[, '<varname>name</varname>'='<varname>value</varname>' ...] + +<varname>file_format</varname> ::= { PARQUET | TEXTFILE | RCFILE | SEQUENCEFILE | AVRO } +</codeblock> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> + + <p rev="2.3.0"> + In <keyword keyref="impala23_full"/> and higher, the <codeph>ALTER TABLE</codeph> statement can + change the metadata for tables containing complex types (<codeph>ARRAY</codeph>, + <codeph>STRUCT</codeph>, and <codeph>MAP</codeph>). + For example, you can use an <codeph>ADD COLUMNS</codeph>, <codeph>DROP COLUMN</codeph>, or <codeph>CHANGE</codeph> + clause to modify the table layout for complex type columns. + Although Impala queries only work for complex type columns in Parquet tables, the complex type support in the + <codeph>ALTER TABLE</codeph> statement applies to all file formats. + For example, you can use Impala to update metadata for a staging table in a non-Parquet file format where the + data is populated by Hive. 
Or you can use <codeph>ALTER TABLE SET FILEFORMAT</codeph> to change the format + of an existing table to Parquet so that Impala can query it. Remember that changing the file format for a table does + not convert the data files within the table; you must prepare any Parquet data files containing complex types + outside Impala, and bring them into the table using <codeph>LOAD DATA</codeph> or updating the table's + <codeph>LOCATION</codeph> property. + See <xref href="impala_complex_types.xml#complex_types"/> for details about using complex types. + </p> + + <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/> + + <p> + Whenever you specify partitions in an <codeph>ALTER TABLE</codeph> statement, through the <codeph>PARTITION + (<varname>partition_spec</varname>)</codeph> clause, you must include all the partitioning columns in the + specification. + </p> + + <p> + Most of the <codeph>ALTER TABLE</codeph> operations work the same for internal tables (managed by Impala) as + for external tables (with data files located in arbitrary locations). The exception is renaming a table; for + an external table, the underlying data directory is not renamed or moved. + </p> + + <p conref="../shared/impala_common.xml#common/s3_blurb"/> + + <p rev="2.6.0 CDH-39913 IMPALA-1878"> + You can specify an <codeph>s3a://</codeph> prefix on the <codeph>LOCATION</codeph> attribute of a table or partition + to make Impala query data from the Amazon S3 filesystem. In <keyword keyref="impala26_full"/> and higher, Impala automatically + handles creating or removing the associated folders when you issue <codeph>ALTER TABLE</codeph> statements + with the <codeph>ADD PARTITION</codeph> or <codeph>DROP PARTITION</codeph> clauses. 
+ </p> + + <p conref="../shared/impala_common.xml#common/s3_ddl"/> + + <p rev="1.4.0"> + <b>HDFS caching (CACHED IN clause):</b> + </p> + + <p rev="1.4.0"> + If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in the table + directory or the partition subdirectories are designated to be loaded into memory with the HDFS caching + mechanism. See <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using the HDFS + caching feature. + </p> + + <p conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/> + + <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/> + + <p> + The following sections show examples of the use cases for various <codeph>ALTER TABLE</codeph> clauses. + </p> + + <p> + <b>To rename a table (RENAME TO clause):</b> + </p> + +<!-- Beefing up the syntax in its original location up to, don't need to repeat it here. +<codeblock>ALTER TABLE <varname>old_name</varname> RENAME TO <varname>new_name</varname>;</codeblock> +--> + + <p> + The <codeph>RENAME TO</codeph> clause lets you change the name of an existing table, and optionally which + database it is located in. + </p> + + <p> + For internal tables, this operation physically renames the directory within HDFS that contains the data files; + the original directory name no longer exists. By qualifying the table names with database names, you can use + this technique to move an internal table (and its associated data directory) from one database to another. + For example: + </p> + +<codeblock>create database d1; +create database d2; +create database d3; +use d1; +create table mobile (x int); +use d2; +-- Move table from another database to the current one. +alter table d1.mobile rename to mobile; +use d1; +-- Move table from one database to another. 
+alter table d2.mobile rename to d3.mobile;</codeblock> + + <p> + For external tables, the <codeph>RENAME TO</codeph> clause changes the table name in the metastore database, + but the table's underlying data directory is not renamed or moved. + </p> + + <p> + <b>To change the physical location where Impala looks for data files associated with a table or + partition:</b> + </p> + +<codeblock>ALTER TABLE <varname>table_name</varname> [PARTITION (<varname>partition_spec</varname>)] SET LOCATION '<varname>hdfs_path_of_directory</varname>';</codeblock> + + <p> + The path you specify is the full HDFS path where the data files reside, or will be created. Impala does not + create any additional subdirectory named after the table. Impala does not move any data files to this new + location or change any data files that might already exist in that directory. + </p> + + <p> + To set the location for a single partition, include the <codeph>PARTITION</codeph> clause. Specify all the + same partitioning columns for the table, with a constant value for each, to precisely identify the single + partition affected by the statement: + </p> + +<codeblock>create table p1 (s string) partitioned by (month int, day int); +-- Each ADD PARTITION clause creates a subdirectory in HDFS. +alter table p1 add partition (month=1, day=1); +alter table p1 add partition (month=1, day=2); +alter table p1 add partition (month=2, day=1); +alter table p1 add partition (month=2, day=2); +-- Redirect queries, INSERT, and LOAD DATA for one partition +-- to a specific different directory.
+alter table p1 partition (month=1, day=1) set location '/usr/external_data/new_years_day'; +</codeblock> + + <note conref="../shared/impala_common.xml#common/add_partition_set_location"/> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + <b>To automatically detect new partition directories added through Hive or HDFS operations:</b> + </p> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + In <keyword keyref="impala23_full"/> and higher, the <codeph>RECOVER PARTITIONS</codeph> clause scans + a partitioned table to detect if any new partition directories were added outside of Impala, + such as by Hive <codeph>ALTER TABLE</codeph> statements or by <cmdname>hdfs dfs</cmdname> + or <cmdname>hadoop fs</cmdname> commands. The <codeph>RECOVER PARTITIONS</codeph> clause + automatically recognizes any data files present in these new directories, the same as + the <codeph>REFRESH</codeph> statement does. + </p> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + For example, here is a sequence of examples showing how you might create a partitioned table in Impala, + create new partitions through Hive, copy data files into the new partitions with the <cmdname>hdfs</cmdname> + command, and have Impala recognize the new partitions and new data: + </p> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + In Impala, create the table, and a single partition for demonstration purposes: + </p> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +create database recover_partitions; +use recover_partitions; +create table t1 (s string) partitioned by (yy int, mm int); +insert into t1 partition (yy = 2016, mm = 1) values ('Partition exists'); +show files in t1; ++---------------------------------------------------------------------+------+--------------+ +| Path | Size | Partition | ++---------------------------------------------------------------------+------+--------------+ +| /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt | 17B | yy=2016/mm=1 | 
++---------------------------------------------------------------------+------+--------------+ +quit; +]]> +</codeblock> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + In Hive, create some new partitions. In a real use case, you might create the + partitions and populate them with data as the final stages of an ETL pipeline. + </p> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +hive> use recover_partitions; +OK +hive> alter table t1 add partition (yy = 2016, mm = 2); +OK +hive> alter table t1 add partition (yy = 2016, mm = 3); +OK +hive> quit; +]]> +</codeblock> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + For demonstration purposes, manually copy data (a single row) into these + new partitions, using manual HDFS operations: + </p> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +$ hdfs dfs -ls /user/hive/warehouse/recover_partitions.db/t1/yy=2016/ +Found 3 items +drwxr-xr-x - impala hive 0 2016-05-09 16:06 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1 +drwxr-xr-x - jrussell hive 0 2016-05-09 16:14 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=2 +drwxr-xr-x - jrussell hive 0 2016-05-09 16:13 /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=3 + +$ hdfs dfs -cp /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt \ + /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=2/data.txt +$ hdfs dfs -cp /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=1/data.txt \ + /user/hive/warehouse/recover_partitions.db/t1/yy=2016/mm=3/data.txt +]]> +</codeblock> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +hive> select * from t1; +OK +Partition exists 2016 1 +Partition exists 2016 2 +Partition exists 2016 3 +hive> quit; +]]> +</codeblock> + + <p rev="2.3.0 IMPALA-1568 CDH-36799"> + In Impala, initially the partitions and data are not visible. 
+ Running <codeph>ALTER TABLE</codeph> with the <codeph>RECOVER PARTITIONS</codeph> + clause scans the table data directory to find any new partition directories, and + the data files inside them: + </p> + +<codeblock rev="2.3.0 IMPALA-1568 CDH-36799"> +<![CDATA[ +select * from t1; ++------------------+------+----+ +| s | yy | mm | ++------------------+------+----+ +| Partition exists | 2016 | 1 | ++------------------+------+----+ + +alter table t1 recover partitions; +select * from t1; ++------------------+------+----+ +| s | yy | mm | ++------------------+------+----+ +| Partition exists | 2016 | 1 | +| Partition exists | 2016 | 3 | +| Partition exists | 2016 | 2 | ++------------------+------+----+ +]]> +</codeblock> + + <p rev="1.2"> + <b>To change the key-value pairs of the TBLPROPERTIES and SERDEPROPERTIES fields:</b> + </p> + +<codeblock>ALTER TABLE <varname>table_name</varname> SET TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>'[, ...]); +ALTER TABLE <varname>table_name</varname> SET SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>'[, ...]);</codeblock> + + <p> + The <codeph>TBLPROPERTIES</codeph> clause is primarily a way to associate arbitrary user-specified data items + with a particular table. + </p> + + <p> + The <codeph>SERDEPROPERTIES</codeph> clause sets up metadata defining how tables are read or written, needed + in some cases by Hive but not used extensively by Impala. You would use this clause primarily to change the + delimiter in an existing text table or partition, by setting the <codeph>'serialization.format'</codeph> and + <codeph>'field.delim'</codeph> property values to the new delimiter character: + </p> + +<codeblock>-- This table begins life as pipe-separated text format. 
+create table change_to_csv (s1 string, s2 string) row format delimited fields terminated by '|'; +-- Then we change it to a CSV table. +alter table change_to_csv set SERDEPROPERTIES ('serialization.format'=',', 'field.delim'=','); +insert overwrite change_to_csv values ('stop','go'), ('yes','no'); +!hdfs dfs -cat 'hdfs://<varname>hostname</varname>:8020/<varname>data_directory</varname>/<varname>dbname</varname>.db/change_to_csv/<varname>data_file</varname>'; +stop,go +yes,no</codeblock> + + <p> + Use the <codeph>DESCRIBE FORMATTED</codeph> statement to see the current values of these properties for an + existing table. See <xref href="impala_create_table.xml#create_table"/> for more details about these clauses. + See <xref href="impala_perf_stats.xml#perf_table_stats_manual"/> for an example of using table properties to + fine-tune the performance-related table statistics. + </p> + + <p> + <b>To manually set or update table or column statistics:</b> + </p> + + <p> + Although for most tables the <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph> + statement is all you need to keep table and column statistics up to date for a table, + sometimes for a very large table or one that is updated frequently, the length of time to recompute + all the statistics might make it impractical to run those statements as often as needed. + As a workaround, you can use the <codeph>ALTER TABLE</codeph> statement to set table statistics + at the level of the entire table or a single partition, or column statistics at the level of + the entire table. + </p> + + <p> + You can set the <codeph>numrows</codeph> value for table statistics by changing the + <codeph>TBLPROPERTIES</codeph> setting for a table or partition. 
+ For example: +<codeblock conref="../shared/impala_common.xml#common/set_numrows_example"/> +<codeblock conref="../shared/impala_common.xml#common/set_numrows_partitioned_example"/> + See <xref href="impala_perf_stats.xml#perf_table_stats_manual"/> for details. + </p> + + <p rev="2.6.0 IMPALA-3369"> + In <keyword keyref="impala26_full"/> and higher, you can use the <codeph>SET COLUMN STATS</codeph> clause + to set a specific stats value for a particular column. + </p> + + <p conref="../shared/impala_common.xml#common/set_column_stats_example"/> + + <p> + <b>To reorganize columns for a table:</b> + </p> + +<codeblock>ALTER TABLE <varname>table_name</varname> ADD COLUMNS (<varname>column_defs</varname>); +ALTER TABLE <varname>table_name</varname> REPLACE COLUMNS (<varname>column_defs</varname>); +ALTER TABLE <varname>table_name</varname> CHANGE <varname>column_name</varname> <varname>new_name</varname> <varname>new_type</varname>; +ALTER TABLE <varname>table_name</varname> DROP <varname>column_name</varname>;</codeblock> + + <p> + The <varname>column_spec</varname> is the same as in the <codeph>CREATE TABLE</codeph> statement: the column + name, then its data type, then an optional comment. You can add multiple columns at a time. The parentheses + are required whether you add a single column or multiple columns. When you replace columns, all the original + column definitions are discarded. You might use this technique if you receive a new set of data files with + different data types or columns in a different order. (The data files are retained, so if the new columns are + incompatible with the old ones, use <codeph>INSERT OVERWRITE</codeph> or <codeph>LOAD DATA OVERWRITE</codeph> + to replace all the data before issuing any further queries.) + </p> + + <p rev="CDH-37178"> + For example, here is how you might add columns to an existing table. 
+ The first <codeph>ALTER TABLE</codeph> adds two new columns, and the second + <codeph>ALTER TABLE</codeph> adds one new column. + A single Impala query reads both the old and new data files, containing different numbers of columns. + For any columns not present in a particular data file, all the column values are + considered to be <codeph>NULL</codeph>. + </p> + +<codeblock rev="CDH-37178"> +create table t1 (x int); +insert into t1 values (1), (2); + +alter table t1 add columns (s string, t timestamp); +insert into t1 values (3, 'three', now()); + +alter table t1 add columns (b boolean); +insert into t1 values (4, 'four', now(), true); + +select * from t1 order by x; ++---+-------+-------------------------------+------+ +| x | s | t | b | ++---+-------+-------------------------------+------+ +| 1 | NULL | NULL | NULL | +| 2 | NULL | NULL | NULL | +| 3 | three | 2016-05-11 11:19:45.054457000 | NULL | +| 4 | four | 2016-05-11 11:20:20.260733000 | true | ++---+-------+-------------------------------+------+ +</codeblock> + + <p> + You might use the <codeph>CHANGE</codeph> clause to rename a single column, or to treat an existing column as + a different type than before, such as to switch between treating a column as <codeph>STRING</codeph> and + <codeph>TIMESTAMP</codeph>, or between <codeph>INT</codeph> and <codeph>BIGINT</codeph>. You can only drop a + single column at a time; to drop multiple columns, issue multiple <codeph>ALTER TABLE</codeph> statements, or + define the new set of columns with a single <codeph>ALTER TABLE ... REPLACE COLUMNS</codeph> statement. + </p> + + <p rev="CDH-37178"> + The following examples show some safe operations to drop or change columns. Dropping the final column + in a table lets Impala ignore the data causing any disruption to existing data files. Changing the type + of a column works if existing data values can be safely converted to the new type. The type conversion + rules depend on the file format of the underlying table. 
For example, in a text table, the same value + can be interpreted as a <codeph>STRING</codeph> or a numeric value, while in a binary format such as + Parquet, the rules are stricter and type conversions only work between certain sizes of integers. + </p> + +<codeblock rev="CDH-37178"> +create table optional_columns (x int, y int, z int, a1 int, a2 int); +insert into optional_columns values (1,2,3,0,0), (2,3,4,100,100); + +-- When the last column in the table is dropped, Impala ignores the +-- values that are no longer needed. (Dropping A1 but leaving A2 +-- would cause problems, as we will see in a subsequent example.) +alter table optional_columns drop column a2; +alter table optional_columns drop column a1; + +select * from optional_columns; ++---+---+---+ +| x | y | z | ++---+---+---+ +| 1 | 2 | 3 | +| 2 | 3 | 4 | ++---+---+---+ +</codeblock> + +<codeblock rev="CDH-37178"> +create table int_to_string (s string, x int); +insert into int_to_string values ('one', 1), ('two', 2); + +-- What was an INT column will now be interpreted as STRING. +-- This technique works for text tables but not other file formats. +-- The second X represents the new name of the column, which we keep the same. +alter table int_to_string change x x string; + +-- Once the type is changed, we can insert non-integer values into the X column +-- and treat that column as a string, for example by uppercasing or concatenating. +insert into int_to_string values ('three', 'trois'); +select s, upper(x) from int_to_string; ++-------+----------+ +| s | upper(x) | ++-------+----------+ +| one | 1 | +| two | 2 | +| three | TROIS | ++-------+----------+ +</codeblock> + + <p rev="CDH-37178"> + Remember that Impala does not actually do any conversion for the underlying data files as a result of + <codeph>ALTER TABLE</codeph> statements. 
If you use <codeph>ALTER TABLE</codeph> to create a table + layout that does not agree with the contents of the underlying files, you must replace the files + yourself, such as using <codeph>LOAD DATA</codeph> to load a new set of data files, or + <codeph>INSERT OVERWRITE</codeph> to copy from another table and replace the original data. + </p> + + <p rev="CDH-37178"> + The following example shows what happens if you delete the middle column from a Parquet table containing three columns. + The underlying data files still contain three columns of data. Because the columns are interpreted based on their positions in + the data file instead of the specific column names, a <codeph>SELECT *</codeph> query now reads the first and second + columns from the data file, potentially leading to unexpected results or conversion errors. + For this reason, if you expect to someday drop a column, declare it as the last column in the table, where its data + can be ignored by queries after the column is dropped. Or, re-run your ETL process and create new data files + if you drop or change the type of a column in a way that causes problems with existing data files. + </p> + +<codeblock rev="CDH-37178"> +-- Parquet table showing how dropping a column can produce unexpected results. +create table p1 (s1 string, s2 string, s3 string) stored as parquet; + +insert into p1 values ('one', 'un', 'uno'), ('two', 'deux', 'dos'), + ('three', 'trois', 'tres'); +select * from p1; ++-------+-------+------+ +| s1 | s2 | s3 | ++-------+-------+------+ +| one | un | uno | +| two | deux | dos | +| three | trois | tres | ++-------+-------+------+ + +alter table p1 drop column s2; +-- The S3 column contains unexpected results. +-- Because S2 and S3 have compatible types, the query reads +-- values from the dropped S2, because the existing data files +-- still contain those values as the second column. 
+select * from p1; ++-------+-------+ +| s1 | s3 | ++-------+-------+ +| one | un | +| two | deux | +| three | trois | ++-------+-------+ +</codeblock> + +<codeblock rev="CDH-37178"> +-- Parquet table showing how dropping a column can produce conversion errors. +create table p2 (s1 string, x int, s3 string) stored as parquet; + +insert into p2 values ('one', 1, 'uno'), ('two', 2, 'dos'), ('three', 3, 'tres'); +select * from p2; ++-------+---+------+ +| s1 | x | s3 | ++-------+---+------+ +| one | 1 | uno | +| two | 2 | dos | +| three | 3 | tres | ++-------+---+------+ + +alter table p2 drop column x; +select * from p2; +WARNINGS: +File '<varname>hdfs_filename</varname>' has an incompatible Parquet schema for column 'add_columns.p2.s3'. +Column type: STRING, Parquet schema: +optional int32 x [i:1 d:1 r:0] + +File '<varname>hdfs_filename</varname>' has an incompatible Parquet schema for column 'add_columns.p2.s3'. +Column type: STRING, Parquet schema: +optional int32 x [i:1 d:1 r:0] +</codeblock> + + <p rev="IMPALA-3092"> + In <keyword keyref="impala26_full"/> and higher, if an Avro table is created without column definitions in the + <codeph>CREATE TABLE</codeph> statement, and columns are later + added through <codeph>ALTER TABLE</codeph>, the resulting + table is now queryable. Missing values from the newly added + columns now default to <codeph>NULL</codeph>. + </p> + + <p> + <b>To change the file format that Impala expects data to be in, for a table or partition:</b> + </p> + + <p> + Use an <codeph>ALTER TABLE ... SET FILEFORMAT</codeph> clause. You can include an optional <codeph>PARTITION + (<varname>col1</varname>=<varname>val1</varname>, <varname>col2</varname>=<varname>val2</varname>, + ...</codeph> clause so that the file format is changed for a specific partition rather than the entire table. 
+ </p> + + <p> + Because this operation only changes the table metadata, you must do any conversion of existing data using + regular Hadoop techniques outside of Impala. Any new data created by the Impala <codeph>INSERT</codeph> + statement will be in the new format. You cannot specify the delimiter for Text files; the data files must be + comma-delimited. +<!-- Although Impala can read Avro tables + created through Hive, you cannot specify the Avro file format in an Impala + <codeph>ALTER TABLE</codeph> statement. --> + </p> + + <p> + To set the file format for a single partition, include the <codeph>PARTITION</codeph> clause. Specify all the + same partitioning columns for the table, with a constant value for each, to precisely identify the single + partition affected by the statement: + </p> + +<codeblock>create table p1 (s string) partitioned by (month int, day int); +-- Each ADD PARTITION clause creates a subdirectory in HDFS. +alter table p1 add partition (month=1, day=1); +alter table p1 add partition (month=1, day=2); +alter table p1 add partition (month=2, day=1); +alter table p1 add partition (month=2, day=2); +-- Queries and INSERT statements will read and write files +-- in this format for this specific partition. +alter table p1 partition (month=2, day=2) set fileformat parquet; +</codeblock> + + <p> + <b>To add or drop partitions for a table</b>, the table must already be partitioned (that is, created with a + <codeph>PARTITIONED BY</codeph> clause). The partition is a physical directory in HDFS, with a name that + encodes a particular column value (the <b>partition key</b>). The Impala <codeph>INSERT</codeph> statement + already creates the partition if necessary, so the <codeph>ALTER TABLE ... ADD PARTITION</codeph> is + primarily useful for importing data by moving or copying existing data files into the HDFS directory + corresponding to a partition. 
+ (You can use the <codeph>LOAD DATA</codeph> statement to move files into the + partition directory, or <codeph>ALTER TABLE ... PARTITION (...) SET LOCATION</codeph> to point a partition at + a directory that already contains data files.) + </p> + + <p> + The <codeph>DROP PARTITION</codeph> clause is used to remove the HDFS directory and associated data files for + a particular set of partition key values; for example, if you always analyze the last 3 months worth of data, + at the beginning of each month you might drop the oldest partition that is no longer needed. Removing + partitions reduces the amount of metadata associated with the table and the complexity of calculating the + optimal query plan, which can simplify and speed up queries on partitioned tables, particularly join queries. + Here is an example showing the <codeph>ADD PARTITION</codeph> and <codeph>DROP PARTITION</codeph> clauses. + </p> + + <p> + To avoid errors while adding or dropping partitions whose existence is not certain, + add the optional <codeph>IF [NOT] EXISTS</codeph> clause between the <codeph>ADD</codeph> or + <codeph>DROP</codeph> keyword and the <codeph>PARTITION</codeph> keyword. That is, the entire + clause becomes <codeph>ADD IF NOT EXISTS PARTITION</codeph> or <codeph>DROP IF EXISTS PARTITION</codeph>. + The following example shows how partitions can be created automatically through <codeph>INSERT</codeph> + statements, or manually through <codeph>ALTER TABLE</codeph> statements. The <codeph>IF [NOT] EXISTS</codeph> + clauses let the <codeph>ALTER TABLE</codeph> statements succeed even if a new requested partition already + exists, or a partition to be dropped does not exist.
+ </p> + +<p> +Inserting 2 year values creates 2 partitions: +</p> + +<codeblock> +create table partition_t (s string) partitioned by (y int); +insert into partition_t (s,y) values ('two thousand',2000), ('nineteen ninety',1990); +show partitions partition_t; ++-------+-------+--------+------+--------------+-------------------+--------+-------------------+ +| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +| 1990 | -1 | 1 | 16B | NOT CACHED | NOT CACHED | TEXT | false | +| 2000 | -1 | 1 | 13B | NOT CACHED | NOT CACHED | TEXT | false | +| Total | -1 | 2 | 29B | 0B | | | | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +</codeblock> + +<p> +Without the <codeph>IF NOT EXISTS</codeph> clause, an attempt to add a new partition might fail: +</p> + +<codeblock> +alter table partition_t add partition (y=2000); +ERROR: AnalysisException: Partition spec already exists: (y=2000). 
+</codeblock> + +<p> +The <codeph>IF NOT EXISTS</codeph> clause makes the statement succeed whether or not there was already a +partition with the specified key value: +</p> + +<codeblock> +alter table partition_t add if not exists partition (y=2000); +alter table partition_t add if not exists partition (y=2010); +show partitions partition_t; ++-------+-------+--------+------+--------------+-------------------+--------+-------------------+ +| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +| 1990 | -1 | 1 | 16B | NOT CACHED | NOT CACHED | TEXT | false | +| 2000 | -1 | 1 | 13B | NOT CACHED | NOT CACHED | TEXT | false | +| 2010 | -1 | 0 | 0B | NOT CACHED | NOT CACHED | TEXT | false | +| Total | -1 | 2 | 29B | 0B | | | | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +</codeblock> + +<p> +Likewise, the <codeph>IF EXISTS</codeph> clause lets <codeph>DROP PARTITION</codeph> succeed whether or not the partition is already +in the table: +</p> + +<codeblock> +alter table partition_t drop if exists partition (y=2000); +alter table partition_t drop if exists partition (y=1950); +show partitions partition_t; ++-------+-------+--------+------+--------------+-------------------+--------+-------------------+ +| y | #Rows | #Files | Size | Bytes Cached | Cache Replication | Format | Incremental stats | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +| 1990 | -1 | 1 | 16B | NOT CACHED | NOT CACHED | TEXT | false | +| 2010 | -1 | 0 | 0B | NOT CACHED | NOT CACHED | TEXT | false | +| Total | -1 | 1 | 16B | 0B | | | | ++-------+-------+--------+------+--------------+-------------------+--------+-------+ +</codeblock> + + <p rev="2.3.0"> The optional <codeph>PURGE</codeph> keyword, available in + <keyword keyref="impala23_full"/> and higher, is used with the <codeph>DROP + 
PARTITION</codeph> clause to remove associated HDFS data files
+    immediately rather than going through the HDFS trashcan mechanism. Use
+    this keyword when dropping a partition if it is crucial to remove the data
+    as quickly as possible to free up space, or if there is a problem with the
+    trashcan, such as the trashcan not being configured or being in a
+    different HDFS encryption zone than the data files. </p>
+
+  <!--
+  To do: Make example more general by partitioning by year/month/day.
+  Then could show inserting into fixed year, variable month and day;
+  dropping particular year/month/day partition.
+  -->
+
+<codeblock>-- Create an empty table and define the partitioning scheme.
+create table part_t (x int) partitioned by (month int);
+-- Create an empty partition into which you could copy data files from some other source.
+alter table part_t add partition (month=1);
+-- After changing the underlying data, issue a REFRESH statement to make the data visible in Impala.
+refresh part_t;
+-- Later, do the same for the next month.
+alter table part_t add partition (month=2);
+
+-- Now you no longer need the older data.
+alter table part_t drop partition (month=1);
+-- If the table was partitioned by month and year, you would issue a statement like:
+-- alter table part_t drop partition (year=2003,month=1);
+-- which would require 12 ALTER TABLE statements to remove a year's worth of data.
+
+-- If the data files for subsequent months were in a different file format,
+-- you could set a different file format for the new partition as you create it.
+alter table part_t add partition (month=3) set fileformat=parquet;
+</codeblock>
+
+  <p>
+    The value specified for a partition key can be an arbitrary constant expression, without any references to
+    columns.
For example:
+  </p>
+
+<codeblock>alter table time_data add partition (month=concat('Decem','ber'));
+alter table sales_data add partition (zipcode = cast(9021 * 10 as string));</codeblock>
+
+  <note>
+    <p>
+      An alternative way to reorganize a table and its associated data files is to use <codeph>CREATE
+      TABLE</codeph> to create a variation of the original table, then use <codeph>INSERT</codeph> to copy the
+      transformed or reordered data to the new table. The advantage of <codeph>ALTER TABLE</codeph> is that it
+      avoids making a duplicate copy of the data files, allowing you to reorganize huge volumes of data in a
+      space-efficient way using familiar Hadoop techniques.
+    </p>
+  </note>
+
+  <p>
+    <b>To switch a table between internal and external:</b>
+  </p>
+
+  <p conref="../shared/impala_common.xml#common/switch_internal_external_table"/>
+
+  <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/>
+
+  <p conref="../shared/impala_common.xml#common/permissions_blurb"/>
+  <p rev="CDH-19187">
+    Most <codeph>ALTER TABLE</codeph> clauses do not actually
+    read or write any HDFS files, and so do not depend on
+    specific HDFS permissions. For example, the <codeph>SET FILEFORMAT</codeph>
+    clause does not actually check the file format of existing data files or
+    convert them to the new format, and the <codeph>SET LOCATION</codeph> clause
+    does not require any special permissions on the new location.
+    (Any permission-related failures would come later, when you
+    actually query or insert into the table.)
+  </p>
+<!-- Haven't rigorously tested all the assertions in the following paragraph. -->
+<!-- Most testing so far has been around RENAME TO clause. -->
+  <p>
+    In general, <codeph>ALTER TABLE</codeph> clauses that do touch
+    HDFS files and directories require the same HDFS permissions
+    as corresponding <codeph>CREATE</codeph>, <codeph>INSERT</codeph>,
+    or <codeph>SELECT</codeph> statements.
+ The permissions allow + the user ID that the <cmdname>impalad</cmdname> daemon runs under, + typically the <codeph>impala</codeph> user, to read or write + files or directories, or (in the case of the execute bit) descend into a directory. + The <codeph>RENAME TO</codeph> clause requires read, write, and execute permission in the + source and destination database directories and in the table data directory, + and read and write permission for the data files within the table. + The <codeph>ADD PARTITION</codeph> and <codeph>DROP PARTITION</codeph> clauses + require write and execute permissions for the associated partition directory. + </p> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_tables.xml#tables"/>, + <xref href="impala_create_table.xml#create_table"/>, <xref href="impala_drop_table.xml#drop_table"/>, + <xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>, + <xref href="impala_tables.xml#external_tables"/> + </p> + </conbody> +</concept>
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/3be0f122/docs/topics/impala_alter_view.xml ---------------------------------------------------------------------- diff --git a/docs/topics/impala_alter_view.xml b/docs/topics/impala_alter_view.xml new file mode 100644 index 0000000..baf539d --- /dev/null +++ b/docs/topics/impala_alter_view.xml @@ -0,0 +1,86 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept rev="1.1" id="alter_view"> + + <title>ALTER VIEW Statement</title> + <titlealts audience="PDF"><navtitle>ALTER VIEW</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="SQL"/> + <data name="Category" value="DDL"/> + <data name="Category" value="Tables"/> + <data name="Category" value="Schemas"/> + <data name="Category" value="Views"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + + <p> + <indexterm audience="Cloudera">ALTER VIEW statement</indexterm> + Changes the characteristics of a view. The syntax has two forms: + </p> + + <ul> + <li> + The <codeph>AS</codeph> clause associates the view with a different query. + </li> + <li> + The <codeph>RENAME TO</codeph> clause changes the name of the view, moves the view to + a different database, or both. + </li> + </ul> + + <p> + Because a view is purely a logical construct (an alias for a query) with no physical data behind it, + <codeph>ALTER VIEW</codeph> only involves changes to metadata in the metastore database, not any data files + in HDFS. + </p> + +<!-- View _permissions_ don't rely on underlying table. --> + +<!-- Could use views to grant access only to certain columns. --> + +<!-- Treated like a table for authorization. --> + +<!-- ALTER VIEW that queries another view - possibly a runtime error. 
--> + + <p conref="../shared/impala_common.xml#common/syntax_blurb"/> + +<codeblock>ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> AS <varname>select_statement</varname> +ALTER VIEW [<varname>database_name</varname>.]<varname>view_name</varname> RENAME TO [<varname>database_name</varname>.]<varname>view_name</varname></codeblock> + + <p conref="../shared/impala_common.xml#common/ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/sync_ddl_blurb"/> + + <p conref="../shared/impala_common.xml#common/security_blurb"/> + <p conref="../shared/impala_common.xml#common/redaction_yes"/> + + <p conref="../shared/impala_common.xml#common/cancel_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/permissions_blurb_no"/> + + <p conref="../shared/impala_common.xml#common/example_blurb"/> + +<codeblock>create table t1 (x int, y int, s string); +create table t2 like t1; +create view v1 as select * from t1; +alter view v1 as select * from t2; +alter view v1 as select x, upper(s) s from t2;</codeblock> + +<!-- Repeat the same blurb + example to see the definition of a view, as in CREATE VIEW. --> + + <p conref="../shared/impala_common.xml#common/describe_formatted_view"/> + + <p conref="../shared/impala_common.xml#common/related_info"/> + + <p> + <xref href="impala_views.xml#views"/>, <xref href="impala_create_view.xml#create_view"/>, + <xref href="impala_drop_view.xml#drop_view"/> + </p> + </conbody> +</concept>
