http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_ports.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_ports.xml b/docs/topics/impala_ports.xml
new file mode 100644
index 0000000..ba57ede
--- /dev/null
+++ b/docs/topics/impala_ports.xml
@@ -0,0 +1,440 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="ports">
+
+  <title>Ports Used by Impala</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Ports"/>
+      <data name="Category" value="Network"/>
+      <data name="Category" value="Administrators"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody id="conbody_ports">
+
+    <p>
+      <indexterm audience="Cloudera">ports</indexterm>
+      Impala uses the TCP ports listed in the following table. Before deploying Impala, ensure these ports are open
+      on each system.
+    </p>
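+
+    <p>
+      For example, one quick way to verify from a client machine that an Impala daemon is reachable
+      on its frontend port is a connection test such as the following (a sketch only; the host name
+      is a placeholder and the <cmdname>nc</cmdname> utility is assumed to be installed):
+    </p>
+
+<codeblock>$ nc -z -v impala-host-1.example.com 21000</codeblock>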
+
+    <table>
+      <tgroup cols="5">
+        <colspec colname="1" colwidth="20*"/>
+        <colspec colname="2" colwidth="30*"/>
+        <colspec colname="3" colwidth="10*"/>
+        <colspec colname="4" colwidth="20*"/>
+        <colspec colname="5" colwidth="30*"/>
+        <thead>
+          <row>
+            <entry>
+              Component
+            </entry>
+            <entry>
+              Service
+            </entry>
+            <entry>
+              Port
+            </entry>
+            <entry>
+              Access Requirement
+            </entry>
+            <entry>
+              Comment
+            </entry>
+          </row>
+        </thead>
+        <tbody>
+          <row>
+            <entry>
+              <p>
+                Impala Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Impala Daemon Frontend Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                21000
+              </p>
+            </entry>
+            <entry>
+              <p>
+                External
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Used to transmit commands and receive results by <codeph>impala-shell</codeph> and
+                version 1.2 of the Cloudera ODBC driver.
+              </p>
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <p>
+                Impala Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Impala Daemon Frontend Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                21050
+              </p>
+            </entry>
+            <entry>
+              <p>
+                External
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Used to transmit commands and receive results by applications, such as Business Intelligence tools,
+                using JDBC, the Beeswax query editor in Hue, and version 2.0 or higher of the Cloudera ODBC driver.
+              </p>
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <p>
+                Impala Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Impala Daemon Backend Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                22000
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal use only. Impala daemons use this port to communicate with each other.
+              </p>
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <p>
+                Impala Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                StateStoreSubscriber Service Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                23000
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal use only. Impala daemons listen on this port for updates from the statestore daemon.
+              </p>
+            </entry>
+          </row>
+          <row rev="2.1.0">
+            <entry>
+              <p>
+                Catalog Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                StateStoreSubscriber Service Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                23020
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal use only. The catalog daemon listens on this port for updates from the statestore daemon.
+              </p>
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <p>
+                Impala Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Impala Daemon HTTP Server Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                25000
+              </p>
+            </entry>
+            <entry>
+              <p>
+                External
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Impala web interface for administrators to monitor and troubleshoot.
+              </p>
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <p>
+                Impala StateStore Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                StateStore HTTP Server Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                25010
+              </p>
+            </entry>
+            <entry>
+              <p>
+                External
+              </p>
+            </entry>
+            <entry>
+              <p>
+                StateStore web interface for administrators to monitor and troubleshoot.
+              </p>
+            </entry>
+          </row>
+          <row rev="1.2">
+            <entry>
+              <p>
+                Impala Catalog Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Catalog HTTP Server Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                25020
+              </p>
+            </entry>
+            <entry>
+              <p>
+                External
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Catalog service web interface for administrators to monitor and troubleshoot. New in
+                Impala 1.2 and higher.
+              </p>
+            </entry>
+          </row>
+          <row>
+            <entry>
+              <p>
+                Impala StateStore Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                StateStore Service Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                24000
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal use only. The statestore daemon listens on this port for registration/unregistration
+                requests.
+              </p>
+            </entry>
+          </row>
+          <row rev="1.2">
+            <entry>
+              <p>
+                Impala Catalog Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                StateStore Service Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                26000
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal use only. The catalog service uses this port to communicate with the Impala daemons. New
+                in Impala 1.2 and higher.
+              </p>
+            </entry>
+          </row>
+          <row rev="1.3.0">
+            <entry>
+              <p>
+                Impala Daemon
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Llama Callback Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                28000
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal use only. Impala daemons use this port to communicate with Llama. New in CDH 5.0.0 and higher.
+              </p>
+            </entry>
+          </row>
+          <row rev="1.3.0">
+            <entry>
+              <p>
+                Impala Llama ApplicationMaster
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Llama Thrift Admin Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                15002
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal use only. New in CDH 5.0.0 and higher.
+              </p>
+            </entry>
+          </row>
+          <row rev="1.3.0">
+            <entry>
+              <p>
+                Impala Llama ApplicationMaster
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Llama Thrift Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                15000
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Internal use only. New in CDH 5.0.0 and higher.
+              </p>
+            </entry>
+          </row>
+          <row rev="1.3.0">
+            <entry>
+              <p>
+                Impala Llama ApplicationMaster
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Llama HTTP Port
+              </p>
+            </entry>
+            <entry>
+              <p>
+                15001
+              </p>
+            </entry>
+            <entry>
+              <p>
+                External
+              </p>
+            </entry>
+            <entry>
+              <p>
+                Llama service web interface for administrators to monitor and troubleshoot. New in
+                CDH 5.0.0 and higher.
+              </p>
+            </entry>
+          </row>
+        </tbody>
+      </tgroup>
+    </table>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_proxy.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_proxy.xml b/docs/topics/impala_proxy.xml
new file mode 100644
index 0000000..84511c7
--- /dev/null
+++ b/docs/topics/impala_proxy.xml
@@ -0,0 +1,635 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="proxy">
+
+  <title>Using Impala through a Proxy for High Availability</title>
+  <titlealts audience="PDF"><navtitle>Load-Balancing Proxy for HA</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="High Availability"/>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Network"/>
+      <data name="Category" value="Proxy"/>
+      <data name="Category" value="Administrators"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      For most clusters that have multiple users and production availability requirements, you might set up a proxy
+      server to relay requests to and from Impala.
+    </p>
+
+    <p>
+      Currently, the Impala statestore mechanism does not include such proxying and load-balancing features. Set up
+      a software package of your choice to perform these functions.
+    </p>
+
+    <note>
+      <p conref="../shared/impala_common.xml#common/statestored_catalogd_ha_blurb"/>
+    </note>
+
+    <p outputclass="toc inpage"/>
+
+  </conbody>
+
+  <concept id="proxy_overview">
+
+    <title>Overview of Proxy Usage and Load Balancing for Impala</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Concepts"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        Using a load-balancing proxy server for Impala has the following advantages:
+      </p>
+
+      <ul>
+        <li>
+          Applications connect to a single well-known host and port, rather than keeping track of the hosts where
+          the <cmdname>impalad</cmdname> daemon is running.
+        </li>
+
+        <li>
+          If any host running the <cmdname>impalad</cmdname> daemon becomes unavailable, application connection
+          requests still succeed because you always connect to the proxy server rather than a specific host running
+          the <cmdname>impalad</cmdname> daemon.
+        </li>
+
+        <li>
+          The coordinator node for each Impala query potentially requires more memory and CPU cycles than the other
+          nodes that process the query. The proxy server can issue queries using round-robin scheduling, so that
+          each connection uses a different coordinator node. This load-balancing technique lets the Impala nodes
+          share this additional work, rather than concentrating it on a single machine.
+        </li>
+      </ul>
+
+      <p>
+        The following setup steps are a general outline that applies to any load-balancing proxy software:
+      </p>
+
+      <ol>
+        <li>
+          Download the load-balancing proxy software. It should only need to be installed and configured on a
+          single host. Pick a host other than the DataNodes where <cmdname>impalad</cmdname> is running,
+          because the intention is to protect against the possibility of one or more of these DataNodes becoming unavailable.
+        </li>
+
+        <li>
+          Configure the load balancer (typically by editing a configuration file).
+          In particular:
+          <ul>
+            <li>
+              <p>
+                Set up a port that the load balancer will listen on to relay Impala requests back and forth.
+              </p>
+            </li>
+            <li>
+              <p rev="DOCS-690">
+                Consider enabling <q>sticky sessions</q>. Cloudera recommends enabling this setting
+                so that stateless client applications such as <cmdname>impala-shell</cmdname> and Hue
+                are not disconnected from long-running queries. Evaluate whether this setting is
+                appropriate for your combination of workload and client applications.
+              </p>
+            </li>
+            <li>
+              <p>
+                For Kerberized clusters, follow the instructions in <xref href="impala_proxy.xml#proxy_kerberos"/>.
+              </p>
+            </li>
+          </ul>
+        </li>
+
+        <li>
+          Specify the host and port settings for each Impala node. These are the hosts that the load balancer will
+          choose from when relaying each Impala query. See <xref href="impala_ports.xml#ports"/> for when to use
+          port 21000, 21050, or another value depending on what type of connections you are load balancing.
+          <note rev="CDH-30399">
+            <p rev="CDH-30399">
+              In particular, if you are using Hue or JDBC-based applications,
+              you typically set up load balancing for both ports 21000 and 21050, because
+              these client applications connect through port 21050 while the <cmdname>impala-shell</cmdname>
+              command connects through port 21000.
+            </p>
+          </note>
+        </li>
+
+        <li>
+          Run the load-balancing proxy server, pointing it at the configuration file that you set up.
+        </li>
+
+        <li>
+          On systems managed by Cloudera Manager, on the page
+          <menucascade><uicontrol>Impala</uicontrol><uicontrol>Configuration</uicontrol><uicontrol>Impala Daemon
+          Default Group</uicontrol></menucascade>, specify a value for the <uicontrol>Impala Daemons Load
+          Balancer</uicontrol> field. Specify the address of the load balancer in
+          <codeph><varname>host</varname>:<varname>port</varname></codeph> format. This setting lets Cloudera
+          Manager route all appropriate Impala-related operations through the proxy server.
+        </li>
+
+        <li>
+          For any scripts, jobs, or configuration settings for applications that formerly connected to a specific
+          DataNode to run Impala SQL statements, change the connection information (such as the <codeph>-i</codeph>
+          option in <cmdname>impala-shell</cmdname>) to point to the load balancer instead.
+        </li>
+      </ol>
+
+      <note>
+        The following sections use the HAProxy software as a representative example of a load balancer
+        that you can use with Impala.
+        For information specifically about using Impala with the F5 BIG-IP load balancer, see
+        <xref href="http://www.cloudera.com/documentation/other/reference-architecture/PDF/Impala-HA-with-F5-BIG-IP.pdf" scope="external" format="html">Impala HA with F5 BIG-IP</xref>.
+      </note>
+
+    </conbody>
+
+  </concept>
+
+  <concept id="proxy_balancing" rev="CDH-33836 DOCS-349 CDH-39925 CDH-36812" 
audience="Cloudera">
+    <title>Choosing the Load-Balancing Algorithm</title>
+    <conbody>
+      <p>
+        Load-balancing software offers a number of algorithms to distribute requests.
+        Each algorithm has its own characteristics that make it suitable in some situations
+        but not others.
+      </p>
+
+      <dl>
+        <dlentry>
+          <dt>leastconn</dt>
+          <dd>
+            Connects sessions to the coordinator with the fewest connections, to balance the load evenly.
+            Typically used for workloads consisting of many independent, short-running queries.
+            In configurations with only a few client machines, this setting can avoid having all
+            requests go to only a small set of coordinators.
+          </dd>
+        </dlentry>
+        <dlentry>
+          <dt>source affinity</dt>
+          <dd>
+            Sessions from the same IP address always go to the same coordinator.
+            A good choice for Impala workloads containing a mix of queries and
+            DDL statements, such as <codeph>CREATE TABLE</codeph> and <codeph>ALTER TABLE</codeph>.
+            Because the metadata changes from a DDL statement take time to propagate across the cluster,
+            prefer to use source affinity in this case. If necessary, run the DDL and subsequent
+            queries that depend on the results of the DDL through the same session, for example
+            by running <codeph>impala-shell -f <varname>script_file</varname></codeph> to submit
+            several statements through a single session.
+            An alternative is to set the query option <codeph>SYNC_DDL=1</codeph>
+            to hold back subsequent queries until the results of a DDL operation have propagated
+            throughout the cluster, but that is a relatively expensive setting.
+            Recommended for use with Hue.
+          </dd>
+        </dlentry>
+        <dlentry>
+          <dt>sticky</dt>
+          <dd>
+            Similar to source affinity. Sessions from the same IP address always go to the same coordinator.
+            The maintenance overhead for the <q>stick tables</q> can cause long-running Hue sessions
+            to disconnect, so source affinity is often a better choice.
+          </dd>
+        </dlentry>
+        <dlentry>
+          <dt>round-robin</dt>
+          <dd>
+            Distributes connections to all coordinator nodes.
+            Typically not recommended for Impala.
+          </dd>
+        </dlentry>
+      </dl>
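+
+      <p>
+        For example, with HAProxy the algorithm is selected by the <codeph>balance</codeph> directive
+        in the <codeph>listen</codeph> section that relays Impala connections. The following is a
+        minimal sketch only; the host names and port are placeholders:
+      </p>
+
+<codeblock>listen impala :25003
+    mode tcp
+    balance leastconn   # or: balance source
+    server impala1 impala-host-1.example.com:21000
+    server impala2 impala-host-2.example.com:21000</codeblock>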
+
+      <p>
+        You might need to perform benchmarks and load testing to determine which setting is optimal for your
+        use case. If some client applications have special characteristics, such as long-running Hue queries
+        working best with source affinity, you might configure multiple virtual IP addresses with a
+        different load-balancing algorithm for each.
+      </p>
+
+    </conbody>
+  </concept>
+
+  <concept id="proxy_kerberos">
+
+    <title>Special Proxy Considerations for Clusters Using Kerberos</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Kerberos"/>
+      <data name="Category" value="Authentication"/>
+      <data name="Category" value="Proxy"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        In a cluster using Kerberos, applications check host credentials to verify that the host they are
+        connecting to is the same one that is actually processing the request, to prevent man-in-the-middle
+        attacks. To clarify that the load-balancing proxy server is legitimate, perform these extra Kerberos setup
+        steps:
+      </p>
+
+      <ol>
+        <li>
+          This section assumes you are starting with a Kerberos-enabled cluster. See
+          <xref href="impala_kerberos.xml#kerberos"/> for instructions for setting up Impala with Kerberos. See the
+          <cite>CDH Security Guide</cite> for
+          <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/cdh_sg_kerberos_prin_keytab_deploy.html" scope="external" format="html">general steps to set up Kerberos</xref>.
+        </li>
+
+        <li>
+          Choose the host you will use for the proxy server. Based on the Kerberos setup procedure, it should
+          already have an entry <codeph>impala/<varname>proxy_host</varname>@<varname>realm</varname></codeph> in
+          its keytab. If not, go back over the initial Kerberos configuration steps for the keytab on each host
+          running the <cmdname>impalad</cmdname> daemon.
+        </li>
+
+        <li rev="CDH-40363">
+          For a cluster managed by Cloudera Manager (5.4.2 or higher), fill in the Impala configuration setting
+          <uicontrol>Impala Daemons Load Balancer</uicontrol> with the appropriate host:port combination.
+          Then restart the Impala service.
+          For systems using a recent level of Cloudera Manager, this is all the configuration you need;
+          you can skip the remaining steps in this procedure.
+        </li>
+
+        <li>
+          On systems not managed by Cloudera Manager, or systems using Cloudera Manager earlier than 5.4.2:
+
+        <ol>
+          <li>
+            Copy the keytab file from the proxy host to all other hosts in the cluster that run the
+            <cmdname>impalad</cmdname> daemon. (For optimal performance, <cmdname>impalad</cmdname> should be running
+            on all DataNodes in the cluster.) Put the keytab file in a secure location on each of these other hosts.
+          </li>
+
+          <li>
+            Add an entry <codeph>impala/<varname>actual_hostname</varname>@<varname>realm</varname></codeph> to the keytab on each
+            host running the <cmdname>impalad</cmdname> daemon.
+          </li>
+
+          <li>
+            For each impalad node, merge the existing keytab with the proxy’s keytab using
+            <cmdname>ktutil</cmdname>, producing a new keytab file. For example:
+  <codeblock>$ ktutil
+  ktutil: read_kt proxy.keytab
+  ktutil: read_kt impala.keytab
+  ktutil: write_kt proxy_impala.keytab
+  ktutil: quit</codeblock>
+            <note>
+              On systems managed by Cloudera Manager 5.1.0 and later, the keytab merging happens automatically. To
+              verify that Cloudera Manager has merged the keytabs, run the command:
+  <codeblock>klist -k <varname>keytabfile</varname></codeblock>
+              which lists the credentials for both <codeph>principal</codeph> and <codeph>be_principal</codeph> on
+              all nodes.
+            </note>
+          </li>
+
+          <li>
+            Make sure that the <codeph>impala</codeph> user has permission to read this merged keytab file.
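+            For example, assuming the merged keytab was written to
+            <filepath>/etc/impala/conf/proxy_impala.keytab</filepath> (a hypothetical path), you
+            might restrict access to it like this:
+<codeblock>$ chown impala /etc/impala/conf/proxy_impala.keytab   # hypothetical path
+$ chmod 400 /etc/impala/conf/proxy_impala.keytab</codeblock>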
+          </li>
+
+          <li>
+            Change some configuration settings for each host in the cluster that participates in the load balancing.
+            Follow the appropriate steps depending on whether you use Cloudera Manager or not:
+            <ul>
+              <li> In the <cmdname>impalad</cmdname> option definition, or the advanced
+                configuration snippet, add: <codeblock>--principal=impala/<varname>proxy_host</varname>@<varname>realm</varname>
+  --be_principal=impala/<varname>actual_host</varname>@<varname>realm</varname>
+  --keytab_file=<varname>path_to_merged_keytab</varname></codeblock>
+                <note>
+                  <p>On a cluster managed by Cloudera Manager 5.1 (or higher),
+                    when you set up Kerberos authentication using the wizard, you
+                    can choose to allow Cloudera Manager to deploy the
+                    <systemoutput>krb5.conf</systemoutput> on your cluster. In
+                    such a case, you do not need to explicitly modify safety valve
+                    parameters as directed above. </p>
+                  <p>Every host has a different <codeph>--be_principal</codeph>
+                    because the actual hostname is different on each host. </p>
+                  <p> Specify the fully qualified domain name (FQDN) for the proxy
+                    host, not the IP address. Use the exact FQDN as returned by a
+                    reverse DNS lookup for the associated IP address. </p>
+                </note>
+              </li>
+
+              <li>
+                On a cluster managed by Cloudera Manager, create a role group to set the configuration values from
+                the preceding step on a per-host basis.
+              </li>
+
+              <li>
+                On a cluster not managed by Cloudera Manager, see
+                <xref href="impala_config_options.xml#config_options"/> for 
the procedure to modify the startup
+                options.
+              </li>
+            </ul>
+          </li>
+
+          <li>
+            Restart Impala to make the changes take effect. Follow the appropriate steps depending on whether you use
+            Cloudera Manager or not:
+            <ul>
+              <li>
+                On a cluster managed by Cloudera Manager, restart the Impala service.
+              </li>
+
+              <li>
+                On a cluster not managed by Cloudera Manager, restart the <cmdname>impalad</cmdname> daemons on all
+                hosts in the cluster, as well as the <cmdname>statestored</cmdname> and <cmdname>catalogd</cmdname>
+                daemons.
+              </li>
+            </ul>
+          </li>
+        </ol>
+        </li>
+      </ol>
+
+<!--
+We basically want to merge the keytab from the proxy host to all the impalad hosts' keytab files. To merge two keytab files, we first need to ship the proxy keytab to all the impalad nodes, then merge keytab files using the MIT Kerberos "ktutil" command-line tool.
+
+<codeblock>$ ktutil
+ktutil: read_kt krb5.keytab
+ktutil: read_kt proxy-host.keytab
+ktutil: write_kt krb5.keytab
+ktutil: quit</codeblock>
+
+The -principal and -be_principal options have to be set through the safety valve.
+-->
+
+    </conbody>
+
+  </concept>
+
+  <concept id="tut_proxy">
+
+    <title>Example of Configuring HAProxy Load Balancer for Impala</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Configuring"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        If you are not already using a load-balancing proxy, you can experiment with
+        <xref href="http://haproxy.1wt.eu/" scope="external" format="html">HAProxy</xref>, a free, open source load
+        balancer. This example shows how you might install and configure that load balancer on a Red Hat Enterprise
+        Linux system.
+      </p>
+
+      <ul>
+        <li>
+          <p>
+            Install the load balancer: <codeph>yum install haproxy</codeph>
+          </p>
+        </li>
+
+        <li>
+          <p>
+            Set up the configuration file: <filepath>/etc/haproxy/haproxy.cfg</filepath>. See the following section
+            for a sample configuration file.
+          </p>
+        </li>
+
+        <li>
+          <p>
+            Run the load balancer (on a single host, preferably one not running <cmdname>impalad</cmdname>):
+          </p>
+<codeblock>/usr/sbin/haproxy -f /etc/haproxy/haproxy.cfg</codeblock>
+        </li>
+
+        <li>
+          <p>
+            In <cmdname>impala-shell</cmdname>, JDBC applications, or ODBC applications, connect to the listener
+            port of the proxy host, rather than port 21000 or 21050 on a host actually running <cmdname>impalad</cmdname>.
+            The sample configuration file sets up HAProxy to listen on port 25003, so you would send all
+            requests to <codeph><varname>haproxy_host</varname>:25003</codeph>.
+          </p>
+        </li>
+      </ul>
+
+      <p>
+        This is the sample <filepath>haproxy.cfg</filepath> used in this example:
+      </p>
+
+<codeblock>global
+    # To have these messages end up in /var/log/haproxy.log you will
+    # need to:
+    #
+    # 1) configure syslog to accept network log events.  This is done
+    #    by adding the '-r' option to the SYSLOGD_OPTIONS in
+    #    /etc/sysconfig/syslog
+    #
+    # 2) configure local2 events to go to the /var/log/haproxy.log
+    #   file. A line like the following can be added to
+    #   /etc/sysconfig/syslog
+    #
+    #    local2.*                       /var/log/haproxy.log
+    #
+    log         127.0.0.1 local0
+    log         127.0.0.1 local1 notice
+    chroot      /var/lib/haproxy
+    pidfile     /var/run/haproxy.pid
+    maxconn     4000
+    user        haproxy
+    group       haproxy
+    daemon
+
+    # turn on stats unix socket
+    #stats socket /var/lib/haproxy/stats
+
+#---------------------------------------------------------------------
+# common defaults that all the 'listen' and 'backend' sections will
+# use if not designated in their block
+#
+# You might need to adjust timing values to prevent timeouts.
+#---------------------------------------------------------------------
+defaults
+    mode                    http
+    log                     global
+    option                  httplog
+    option                  dontlognull
+    option http-server-close
+    option forwardfor       except 127.0.0.0/8
+    option                  redispatch
+    retries                 3
+    maxconn                 3000
+    contimeout 5000
+    clitimeout 50000
+    srvtimeout 50000
+
+#
+# This sets up the admin page for HA Proxy at port 25002.
+#
+listen stats :25002
+    balance
+    mode http
+    stats enable
+    stats auth <varname>username</varname>:<varname>password</varname>
+
+# This is the setup for Impala. Impala clients connect to load_balancer_host:25003.
+# HAProxy will balance connections among the list of servers listed below.
+# The impalad servers listed below listen on port 21000 for beeswax (impala-shell) or the original ODBC driver.
+# For the JDBC or ODBC version 2.x driver, use port 21050 instead of 21000.
+listen impala :25003
+    mode tcp
+    option tcplog
+    balance leastconn
+
+    server <varname>symbolic_name_1</varname> impala-host-1.example.com:21000
+    server <varname>symbolic_name_2</varname> impala-host-2.example.com:21000
+    server <varname>symbolic_name_3</varname> impala-host-3.example.com:21000
+    server <varname>symbolic_name_4</varname> impala-host-4.example.com:21000
+
+# Setup for Hue or other JDBC-enabled applications.
+# In particular, Hue requires sticky sessions.
+# The application connects to load_balancer_host:21051, and HAProxy balances
+# connections to the associated hosts, where Impala listens for JDBC
+# requests on port 21050.
+listen impalajdbc :21051
+    mode tcp
+    option tcplog
+    balance source
+    server <varname>symbolic_name_5</varname> impala-host-1.example.com:21050
+    server <varname>symbolic_name_6</varname> impala-host-2.example.com:21050
+    server <varname>symbolic_name_7</varname> impala-host-3.example.com:21050
+    server <varname>symbolic_name_8</varname> impala-host-4.example.com:21050
+</codeblock>
+
+      <note conref="../shared/impala_common.xml#common/proxy_jdbc_caveat"/>
+
+      <p audience="Cloudera">
+        The following example shows extra steps needed for a cluster using Kerberos authentication:
+      </p>
+
+<codeblock audience="Cloudera">$ klist
+$ impala-shell -k
+$ kinit -r 1d -kt /systest/keytabs/hdfs.keytab hdfs
+$ impala-shell -i c2104.hal.cloudera.com:21000
+$ impala-shell -i c2104.hal.cloudera.com:25003
+[root@c2104 alan]# ps -ef |grep impalad
+root      6442  6428  0 12:21 pts/0    00:00:00 grep impalad
+impala   30577 22192 99 Nov14 ?        3-16:42:32 /usr/lib/impala/sbin-debug/impalad --flagfile=/var/run/cloudera-scm-agent/process/10342-impala-IMPALAD/impala-conf/impalad_flags
+[root@c2104 alan]# vi /var/run/cloudera-scm-agent/process/10342-impala-IMPALAD/impala-conf/impalad_flags
+$ klist -k /var/run/cloudera-scm-agent/process/10342-impala-IMPALAD/impala.keytab
+Keytab name: FILE:/var/run/cloudera-scm-agent/process/10342-impala-IMPALAD/impala.keytab
+KVNO Principal
+---- --------------------------------------------------------------------------
+   2 impala/[email protected]
+   2 impala/[email protected]
+   2 impala/[email protected]
+   2 impala/[email protected]
+   2 HTTP/[email protected]
+   2 HTTP/[email protected]
+   2 HTTP/[email protected]
+   2 HTTP/[email protected]
+$ klist
+Ticket cache: FILE:/tmp/krb5cc_4028
+Default principal: [email protected]
+
+Valid starting     Expires            Service principal
+11/15/13 12:17:17  11/15/13 12:32:17  krbtgt/[email protected]
+        renew until 11/16/13 12:17:17
+11/15/13 12:17:21  11/15/13 12:32:17  impala/[email protected]
+        renew until 11/16/13 12:17:17
+$ kinit -r 1d -kt /systest/keytabs/hdfs.keytab hdfs
+$ kinit -R
+$ impala-shell -k -i c2106.hal.cloudera.com:21000
+Starting Impala Shell using Kerberos authentication
+Using service name 'impala'
+Connected to c2106.hal.cloudera.com:21000
+$ impala-shell -i c2104.hal.cloudera.com:25003
+$ impala-shell -k -i c2104.hal.cloudera.com:25003
+Starting Impala Shell using Kerberos authentication
+Using service name 'impala'
+Connected to c2104.hal.cloudera.com:25003
+[c2104.hal.cloudera.com:25003] &gt; create table alan_tmp(a int);
+Query: create table alan_tmp(a int)
+ERROR: InternalException: Got exception: org.apache.hadoop.ipc.RemoteException User: hive/[email protected] is not allowed to impersonate impala/[email protected]
+$ kdestroy
+$ kinit -r 1d -kt /systest/keytabs/hdfs.keytab hdfs
+$ impala-shell -k -i c2104.hal.cloudera.com:25003
+# klist -k c2104.keytab
+Keytab name: FILE:c2104.keytab
+KVNO Principal
+---- --------------------------------------------------------------------------
+   2 impala/[email protected]
+   2 impala/[email protected]
+   2 impala/[email protected]
+   2 impala/[email protected]
+   2 HTTP/[email protected]
+   2 HTTP/[email protected]
+   2 HTTP/[email protected]
+   2 HTTP/[email protected]
+$ klist -k -t c2106.keytab
+Keytab name: FILE:c2106.keytab
+KVNO Timestamp         Principal
+---- ----------------- --------------------------------------------------------
+   2 02/14/13 12:12:22 HTTP/[email protected]
+   2 02/14/13 12:12:22 HTTP/[email protected]
+   2 02/14/13 12:12:22 HTTP/[email protected]
+   2 02/14/13 12:12:22 HTTP/[email protected]
+   2 02/14/13 12:12:22 impala/[email protected]
+   2 02/14/13 12:12:22 impala/[email protected]
+   2 02/14/13 12:12:22 impala/[email protected]
+   2 02/14/13 12:12:22 impala/[email protected]
+$ ktutil
+ktutil:  rkt c2104.keytab
+ktutil:  rkt c2106.keytab
+ktutil:  wkt my_test.keytab
+ktutil:  q
+$ klist -k -t my_test.keytab
+Keytab name: FILE:my_test.keytab
+KVNO Timestamp         Principal
+---- ----------------- --------------------------------------------------------
+   2 11/21/13 16:22:40 impala/[email protected]
+   2 11/21/13 16:22:40 impala/[email protected]
+   2 11/21/13 16:22:40 impala/[email protected]
+   2 11/21/13 16:22:40 impala/[email protected]
+   2 11/21/13 16:22:40 HTTP/[email protected]
+   2 11/21/13 16:22:40 HTTP/[email protected]
+   2 11/21/13 16:22:40 HTTP/[email protected]
+   2 11/21/13 16:22:40 HTTP/[email protected]
+   2 11/21/13 16:22:40 HTTP/[email protected]
+   2 11/21/13 16:22:41 HTTP/[email protected]
+   2 11/21/13 16:22:41 HTTP/[email protected]
+   2 11/21/13 16:22:41 HTTP/[email protected]
+   2 11/21/13 16:22:41 impala/[email protected]
+   2 11/21/13 16:22:41 impala/[email protected]
+   2 11/21/13 16:22:41 impala/[email protected]
+   2 11/21/13 16:22:41 impala/[email protected]
+$ kdestroy
+$ kinit -r 1d -kt /systest/keytabs/hdfs.keytab hdfs
+$ vi README
+$ kinit -R
+$ impala-shell -k -i c2104.hal.cloudera.com:25003
+Starting Impala Shell using Kerberos authentication
+Using service name 'impala'
+Connected to c2104.hal.cloudera.com:25003
+<ph conref="../shared/ImpalaVariables.xml#impala_vars/ImpaladBanner"/>
+Welcome to the Impala shell. Press TAB twice to see a list of available commands.
+
+Copyright (c) 2012 Cloudera, Inc. All rights reserved.
+
+<ph conref="../shared/ImpalaVariables.xml#impala_vars/ShellBanner"/>
+[c2104.hal.cloudera.com:25003] &gt; show tables;
+Query: show tables
+ERROR: AnalysisException: This Impala daemon is not ready to accept user requests. Status: Waiting for catalog update from the StateStore.
+[c2104.hal.cloudera.com:25003] &gt; quit;</codeblock>
+
+      <!--
+        At that point in the walkthrough with Alan Choi, we could never get Impala to accept any requests through the catalog server.
+        So I have not seen a 100% successful proxy setup process to verify all the details.
+      -->
+
+    </conbody>
+
+  </concept>
+
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_rcfile.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_rcfile.xml b/docs/topics/impala_rcfile.xml
new file mode 100644
index 0000000..1bfab8c
--- /dev/null
+++ b/docs/topics/impala_rcfile.xml
@@ -0,0 +1,244 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="rcfile">
+
+  <title>Using the RCFile File Format with Impala Tables</title>
+  <titlealts audience="PDF"><navtitle>RCFile Data Files</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <!-- <data name="Category" value="RCFile"/> -->
+      <data name="Category" value="File Formats"/>
+      <data name="Category" value="Tables"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">RCFile support in Impala</indexterm>
+      Impala supports using RCFile data files.
+    </p>
+
+    <table>
+      <title>RCFile Format Support in Impala</title>
+      <tgroup cols="5">
+        <colspec colname="1" colwidth="10*"/>
+        <colspec colname="2" colwidth="10*"/>
+        <colspec colname="3" colwidth="20*"/>
+        <colspec colname="4" colwidth="30*"/>
+        <colspec colname="5" colwidth="30*"/>
+        <thead>
+          <row>
+            <entry>
+              File Type
+            </entry>
+            <entry>
+              Format
+            </entry>
+            <entry>
+              Compression Codecs
+            </entry>
+            <entry>
+              Impala Can CREATE?
+            </entry>
+            <entry>
+              Impala Can INSERT?
+            </entry>
+          </row>
+        </thead>
+        <tbody>
+          <row conref="impala_file_formats.xml#file_formats/rcfile_support">
+            <entry/>
+          </row>
+        </tbody>
+      </tgroup>
+    </table>
+
+    <p outputclass="toc inpage"/>
+  </conbody>
+
+  <concept id="rcfile_create">
+
+    <title>Creating RCFile Tables and Loading Data</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="ETL"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        If you do not have an existing data file to use, begin by creating one in the appropriate format.
+      </p>
+
+      <p>
+        <b>To create an RCFile table:</b>
+      </p>
+
+      <p>
+        In the <codeph>impala-shell</codeph> interpreter, issue a command similar to:
+      </p>
+
+<codeblock>create table rcfile_table (<varname>column_specs</varname>) stored as rcfile;</codeblock>
+
+      <p>
+        Because Impala can query some kinds of tables that it cannot currently write to, after creating tables of
+        certain file formats, you might use the Hive shell to load the data. See
+        <xref href="impala_file_formats.xml#file_formats"/> for details. After loading data into a table through
+        Hive or other mechanism outside of Impala, issue a <codeph>REFRESH <varname>table_name</varname></codeph>
+        statement the next time you connect to the Impala node, before querying the table, to make Impala recognize
+        the new data.
+      </p>
+
+      <note type="important">
+        See <xref href="impala_known_issues.xml#known_issues"/> for potential 
compatibility issues with
+        RCFile tables created in Hive 0.12, due to a change in the default 
RCFile SerDe for Hive.
+      </note>
+
+      <p>
+        For example, here is how you might create some RCFile tables in Impala (by specifying the columns
+        explicitly, or cloning the structure of another table), load data through Hive, and query them through
+        Impala:
+      </p>
+
+<codeblock>$ impala-shell -i localhost
+[localhost:21000] &gt; create table rcfile_table (x int) stored as rcfile;
+[localhost:21000] &gt; create table rcfile_clone like some_other_table stored as rcfile;
+[localhost:21000] &gt; quit;
+
+$ hive
+hive&gt; insert into table rcfile_table select x from some_other_table;
+3 Rows loaded to rcfile_table
+Time taken: 19.015 seconds
+hive&gt; quit;
+
+$ impala-shell -i localhost
+[localhost:21000] &gt; select * from rcfile_table;
+Returned 0 row(s) in 0.23s
+[localhost:21000] &gt; -- Make Impala recognize the data loaded through Hive;
+[localhost:21000] &gt; refresh rcfile_table;
+[localhost:21000] &gt; select * from rcfile_table;
++---+
+| x |
++---+
+| 1 |
+| 2 |
+| 3 |
++---+
+Returned 3 row(s) in 0.23s</codeblock>
+
+      <p conref="../shared/impala_common.xml#common/complex_types_unsupported_filetype"/>
+
+    </conbody>
+  </concept>
+
+  <concept id="rcfile_compression">
+
+    <title>Enabling Compression for RCFile Tables</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Snappy"/>
+      <data name="Category" value="Compression"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        <indexterm audience="Cloudera">compression</indexterm>
+        You may want to enable compression on existing tables. Enabling compression provides performance gains in
+        most cases and is supported for RCFile tables. For example, to enable Snappy compression, you would specify
+        the following additional settings when loading data through the Hive shell:
+      </p>
+
+<codeblock>hive&gt; SET hive.exec.compress.output=true;
+hive&gt; SET mapred.max.split.size=256000000;
+hive&gt; SET mapred.output.compression.type=BLOCK;
+hive&gt; SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
+hive&gt; INSERT OVERWRITE TABLE <varname>new_table</varname> SELECT * FROM <varname>old_table</varname>;</codeblock>
+
+      <p>
+        If you are converting partitioned tables, you must complete additional steps. In such a case, specify
+        additional settings similar to the following:
+      </p>
+
+<codeblock>hive&gt; CREATE TABLE <varname>new_table</varname> (<varname>your_cols</varname>) PARTITIONED BY (<varname>partition_cols</varname>) STORED AS <varname>new_format</varname>;
+hive&gt; SET hive.exec.dynamic.partition.mode=nonstrict;
+hive&gt; SET hive.exec.dynamic.partition=true;
+hive&gt; INSERT OVERWRITE TABLE <varname>new_table</varname> PARTITION(<varname>comma_separated_partition_cols</varname>) SELECT * FROM <varname>old_table</varname>;</codeblock>
+
+      <p>
+        Remember that Hive does not require that you specify a source format for it. Consider the case of
+        converting a table with two partition columns called <codeph>year</codeph> and <codeph>month</codeph> to a
+        Snappy compressed RCFile. Combining the components outlined previously to complete this table conversion,
+        you would specify settings similar to the following:
+      </p>
+
+<codeblock>hive&gt; CREATE TABLE tbl_rc (int_col INT, string_col STRING) STORED AS RCFILE;
+hive&gt; SET hive.exec.compress.output=true;
+hive&gt; SET mapred.max.split.size=256000000;
+hive&gt; SET mapred.output.compression.type=BLOCK;
+hive&gt; SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
+hive&gt; SET hive.exec.dynamic.partition.mode=nonstrict;
+hive&gt; SET hive.exec.dynamic.partition=true;
+hive&gt; INSERT OVERWRITE TABLE tbl_rc SELECT * FROM tbl;</codeblock>
+
+      <p>
+        To complete a similar process for a table that includes partitions, you would specify settings similar to
+        the following:
+      </p>
+
+<codeblock>hive&gt; CREATE TABLE tbl_rc (int_col INT, string_col STRING) PARTITIONED BY (year INT) STORED AS RCFILE;
+hive&gt; SET hive.exec.compress.output=true;
+hive&gt; SET mapred.max.split.size=256000000;
+hive&gt; SET mapred.output.compression.type=BLOCK;
+hive&gt; SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
+hive&gt; SET hive.exec.dynamic.partition.mode=nonstrict;
+hive&gt; SET hive.exec.dynamic.partition=true;
+hive&gt; INSERT OVERWRITE TABLE tbl_rc PARTITION(year) SELECT * FROM tbl;</codeblock>
+
+      <note>
+        <p>
+          The compression type is specified in the following command:
+        </p>
+<codeblock>SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;</codeblock>
+        <p>
+          You could elect to specify alternative codecs such as <codeph>GzipCodec</codeph> here.
+        </p>
+      </note>
+    </conbody>
+  </concept>
+
+  <concept id="rcfile_performance">
+
+    <title>Query Performance for Impala RCFile Tables</title>
+
+    <conbody>
+
+      <p>
+        In general, expect query performance with RCFile tables to be
+        faster than with tables using text data, but slower than with
+        Parquet tables. See <xref href="impala_parquet.xml#parquet"/>
+        for information about using the Parquet file format for
+        high-performance analytic queries.
+      </p>
+
+      <p conref="../shared/impala_common.xml#common/s3_block_splitting"/>
+
+    </conbody>
+  </concept>
+
+  <concept audience="Cloudera" id="rcfile_data_types">
+
+    <title>Data Type Considerations for RCFile Tables</title>
+
+    <conbody>
+
+      <p></p>
+    </conbody>
+  </concept>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_release_notes.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_release_notes.xml b/docs/topics/impala_release_notes.xml
new file mode 100644
index 0000000..65a3997
--- /dev/null
+++ b/docs/topics/impala_release_notes.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="impala_release_notes">
+
+  <title>Impala Release Notes</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Release Notes"/>
+      <data name="Category" value="Administrators"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody conref="impala_relnotes.xml#relnotes/relnotes_intro"/>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_schema_design.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_schema_design.xml b/docs/topics/impala_schema_design.xml
new file mode 100644
index 0000000..4d08de5
--- /dev/null
+++ b/docs/topics/impala_schema_design.xml
@@ -0,0 +1,222 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="schema_design">
+
+  <title>Guidelines for Designing Impala Schemas</title>
+  <titlealts audience="PDF"><navtitle>Designing Schemas</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Planning"/>
+      <data name="Category" value="Sectionated Pages"/>
+      <data name="Category" value="Proof of Concept"/>
+      <data name="Category" value="Checklists"/>
+      <data name="Category" value="Guidelines"/>
+      <data name="Category" value="Best Practices"/>
+      <data name="Category" value="Performance"/>
+      <data name="Category" value="Compression"/>
+      <data name="Category" value="Tables"/>
+      <data name="Category" value="Schemas"/>
+      <data name="Category" value="SQL"/>
+      <data name="Category" value="Porting"/>
+      <data name="Category" value="Proof of Concept"/>
+      <data name="Category" value="Administrators"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      The guidelines in this topic help you to construct an optimized and scalable schema, one that integrates well
+      with your existing data management processes. Use these guidelines as a checklist when doing any
+      proof-of-concept work, porting exercise, or before deploying to production.
production.
+    </p>
+
+    <p>
+      If you are adapting an existing database or Hive schema for use with Impala, read the guidelines in this
+      section and then see <xref href="impala_porting.xml#porting"/> for
specific porting and compatibility tips.
+    </p>
+
+    <p outputclass="toc inpage"/>
+
+    <section id="schema_design_text_vs_binary">
+
+      <title>Prefer binary file formats over text-based formats.</title>
+
+      <p>
+        To save space and improve memory usage and query performance, use binary file formats for any large or
+        intensively queried tables. The Parquet file format is the most efficient for data warehouse-style analytic
+        queries. Avro is the other binary file format that Impala supports, which you might already have as part of
+        a Hadoop ETL pipeline.
+      </p>
+
+      <p>
+        Although Impala can create and query tables with the RCFile and SequenceFile file formats, such tables are
+        relatively bulky due to the text-based nature of those formats, and are not optimized for data
+        warehouse-style queries due to their row-oriented layout. Impala does not support <codeph>INSERT</codeph>
+        operations for tables with these file formats.
+      </p>
+
+      <p>
+        Guidelines:
+      </p>
+
+      <ul>
+        <li>
+          For an efficient and scalable format for large, performance-critical tables, use the Parquet file format.
+        </li>
+
+        <li>
+          To deliver intermediate data during the ETL process, in a format that can also be used by other Hadoop
+          components, Avro is a reasonable choice.
+        </li>
+
+        <li>
+          For convenient import of raw data, use a text table instead of RCFile or SequenceFile, and convert to
+          Parquet in a later stage of the ETL process.
+        </li>
+      </ul>
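+
+      <p>
+        For example, a common pattern is to land raw data in a text table and convert it to Parquet
+        for querying. A minimal sketch in <cmdname>impala-shell</cmdname> (table and column names
+        are hypothetical):
+      </p>
+
+<codeblock>-- Landing table for raw comma-separated files:
+create table raw_events (event_time string, user_id bigint, detail string)
+  row format delimited fields terminated by ',';
+-- After loading the raw text files, convert to Parquet in one statement:
+create table events stored as parquet as select * from raw_events;</codeblock>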
+    </section>
+
+    <section id="schema_design_compression">
+
+      <title>Use Snappy compression where practical.</title>
+
+      <p>
+        Snappy compression involves low CPU overhead to decompress, while still providing substantial space
+        savings. In cases where you have a choice of compression codecs, such as with the Parquet and Avro file
+        formats, use Snappy compression unless you find a compelling reason to
use a different codec.
+      </p>
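+
+      <p>
+        For example, in Impala the codec used by Parquet <codeph>INSERT</codeph> operations is
+        controlled by the <codeph>COMPRESSION_CODEC</codeph> query option. A brief sketch (table
+        names are hypothetical):
+      </p>
+
+<codeblock>set compression_codec=snappy;
+insert into parquet_table select * from text_table;</codeblock>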
+    </section>
+
+    <section id="schema_design_numeric_types">
+
+      <title>Prefer numeric types over strings.</title>
+
+      <p>
+        If you have numeric values that you could treat as either strings or numbers (such as
+        <codeph>YEAR</codeph>, <codeph>MONTH</codeph>, and <codeph>DAY</codeph> for partition key columns), define
+        them as the smallest applicable integer types. For example, <codeph>YEAR</codeph> can be
+        <codeph>SMALLINT</codeph>, <codeph>MONTH</codeph> and <codeph>DAY</codeph> can be <codeph>TINYINT</codeph>.
+        Although you might not see any difference in the way partitioned tables or text files are laid out on disk,
+        using numeric types will save space in binary formats such as Parquet, and in memory when doing queries,
+        particularly resource-intensive queries such as joins.
+      </p>
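+
+      <p>
+        For example, a partitioned table sketch using the smallest applicable integer types for the
+        partition key columns (table and column names are hypothetical):
+      </p>
+
+<codeblock>create table sales (id bigint, amount decimal(9,2))
+  partitioned by (year smallint, month tinyint, day tinyint)
+  stored as parquet;</codeblock>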
+    </section>
+
+<!-- Alan suggests not making this recommendation.
+<section id="schema_design_decimal">
+<title>Prefer DECIMAL types over FLOAT and DOUBLE.</title>
+<p>
+</p>
+</section>
+-->
+
+    <section id="schema_design_partitioning">
+
+      <title>Partition, but do not over-partition.</title>
+
+      <p>
+        Partitioning is an important aspect of performance tuning for Impala. Follow the procedures in
+        <xref href="impala_partitioning.xml#partitioning"/> to set up partitioning for your biggest, most
+        intensively queried tables.
+      </p>
+
+      <p>
+        If you are moving to Impala from a traditional database system, or just getting started in the Big Data
+        field, you might not have enough data volume to take advantage of Impala parallel queries with your
+        existing partitioning scheme. For example, if you have only a few tens of megabytes of data per day,
+        partitioning by <codeph>YEAR</codeph>, <codeph>MONTH</codeph>, and <codeph>DAY</codeph> columns might be
+        too granular. Most of your cluster might be sitting idle during queries that target a single day, or each
+        node might have very little work to do. Consider reducing the number of partition key columns so that each
+        partition directory contains several gigabytes worth of data.
+      </p>
+
+      <p rev="parquet_block_size">
+        For example, consider a Parquet table where each data file is 1 HDFS block, with a maximum block size of 1
+        GB. (In Impala 2.0 and later, the default Parquet block size is reduced to 256 MB. For this exercise, let's
+        assume you have bumped the size back up to 1 GB by setting the query option
+        <codeph>PARQUET_FILE_SIZE=1g</codeph>.) If you have a 10-node cluster, you need 10 data files (up to 10 GB)
+        to give each node some work to do for a query. But each core on each machine can process a separate data
+        block in parallel. With 16-core machines on a 10-node cluster, a query could process up to 160 GB fully in
+        parallel. If there are only a few data files per partition, not only are most cluster nodes sitting idle
+        during queries, so are most cores on those machines.
+      </p>
+
+      <p>
+        You can reduce the Parquet block size to as low as 128 MB or 64 MB to 
increase the number of files per
+        partition and improve parallelism. But also consider reducing the 
level of partitioning so that analytic
+        queries have enough data to work with.
+      </p>
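+
+      <p rev="parquet_block_size">
+        For example, to produce more, smaller data files for a partitioned
+        Parquet table, you might set the block size before the
+        <codeph>INSERT</codeph>. (The table names are hypothetical, and this
+        sketch assumes the staging table's columns end with the partition key
+        columns.)
+      </p>
+
+<codeblock>set parquet_file_size=128m;
+insert overwrite table sales partition (year, month) select * from sales_staging;</codeblock>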
+    </section>
+
+    <section id="schema_design_compute_stats">
+
+      <title>Always compute stats after loading data.</title>
+
+      <p>
+        Impala makes extensive use of statistics about data in the overall 
table and in each column, to help plan
+        resource-intensive operations such as join queries and inserting into 
partitioned Parquet tables. Because
+        this information is only available after data is loaded, run the
+        <codeph>COMPUTE STATS</codeph> statement after loading or replacing
+        data in a table or partition.
+      </p>
+
+      <p>
+        Having accurate statistics can make the difference between a
+        successful operation and one that fails due to an out-of-memory error
+        or a timeout. When you encounter performance or capacity issues,
+        always use the <codeph>SHOW TABLE STATS</codeph> and
+        <codeph>SHOW COLUMN STATS</codeph> statements to check whether
+        statistics are present and up-to-date for all tables in the query.
+      </p>
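+
+      <p>
+        For example, a typical sequence after a data load might look like the
+        following (assuming a table named <codeph>sales</codeph>):
+      </p>
+
+<codeblock>compute stats sales;
+show table stats sales;
+show column stats sales;</codeblock>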
+
+      <p>
+        When doing a join query, Impala consults the statistics for each 
joined table to determine their relative
+        sizes and to estimate the number of rows produced in each join stage. 
When doing an <codeph>INSERT</codeph>
+        into a Parquet table, Impala consults the statistics for the source 
table to determine how to distribute
+        the work of constructing the data files for each partition.
+      </p>
+
+      <p>
+        See <xref href="impala_compute_stats.xml#compute_stats"/> for the 
syntax of the <codeph>COMPUTE
+        STATS</codeph> statement, and <xref 
href="impala_perf_stats.xml#perf_stats"/> for all the performance
+        considerations for table and column statistics.
+      </p>
+    </section>
+
+    <section id="schema_design_explain">
+
+      <title>Verify sensible execution plans with EXPLAIN and SUMMARY.</title>
+
+      <p>
+        Before executing a resource-intensive query, use the 
<codeph>EXPLAIN</codeph> statement to get an overview
+        of how Impala intends to parallelize the query and distribute the 
work. If you see that the query plan is
+        inefficient, you can take tuning steps such as changing file formats, 
using partitioned tables, running the
+        <codeph>COMPUTE STATS</codeph> statement, or adding query hints. For 
information about all of these
+        techniques, see <xref href="impala_performance.xml#performance"/>.
+      </p>
+
+      <p>
+        After you run a query, you can see performance-related information 
about how it actually ran by issuing the
+        <codeph>SUMMARY</codeph> command in <cmdname>impala-shell</cmdname>. 
Prior to Impala 1.4, you would use
+        the <codeph>PROFILE</codeph> command, but its highly technical output 
was only useful for the most
+        experienced users. <codeph>SUMMARY</codeph>, new in Impala 1.4,
+        summarizes the most useful information for all stages of execution,
+        aggregated across all nodes rather than splitting out figures for
+        each node.
+      </p>
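+
+      <p>
+        For example, a quick check in <cmdname>impala-shell</cmdname> might
+        look like the following (the table name is hypothetical, and the plan
+        and timing output is abbreviated):
+      </p>
+
+<codeblock>[localhost:21000] &gt; explain select count(*) from sales;
+...
+[localhost:21000] &gt; select count(*) from sales;
+...
+[localhost:21000] &gt; summary;
+...</codeblock>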
+    </section>
+
+<!--
+<section id="schema_design_mem_limits">
+<title>Allocate resources Between Impala and batch jobs (MapReduce, Hive, 
Pig).</title>
+<p>
+</p>
+</section>
+
+<section id="schema_design_cm">
+<title>Use Cloudera Manager to monitor queries and overall performance.</title>
+<p>
+</p>
+</section>
+-->
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_security_files.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_security_files.xml 
b/docs/topics/impala_security_files.xml
new file mode 100644
index 0000000..befe696
--- /dev/null
+++ b/docs/topics/impala_security_files.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="secure_files">
+
+  <title>Securing Impala Data and Log Files</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Logs"/>
+      <data name="Category" value="HDFS"/>
+      <data name="Category" value="Administrators"/>
+      <!-- To do for John: mention redaction as a fallback to keep sensitive 
info out of the log files. -->
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      One aspect of security is to protect files from unauthorized access at 
the filesystem level. For example, if
+      you store sensitive data in HDFS, you specify permissions on the 
associated files and directories in HDFS to
+      restrict read and write permissions to the appropriate users and groups.
+    </p>
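+
+    <p>
+      For example, you might restrict the directory for a sensitive database
+      like this (a sketch only; the path is hypothetical, using the
+      <codeph>impala</codeph> user and <codeph>hive</codeph> group discussed
+      in this topic):
+    </p>
+
+<codeblock>$ hdfs dfs -chown -R impala:hive /user/hive/warehouse/sensitive_db.db
+$ hdfs dfs -chmod -R 750 /user/hive/warehouse/sensitive_db.db</codeblock>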
+
+    <p>
+      If you issue queries containing sensitive values in the 
<codeph>WHERE</codeph> clause, such as financial
+      account numbers, those values are stored in Impala log files in the 
Linux filesystem and you must secure
+      those files also. For the locations of Impala log files, see <xref 
href="impala_logging.xml#logging"/>.
+    </p>
+
+    <p>
+      All Impala read and write operations are performed under the filesystem 
privileges of the
+      <codeph>impala</codeph> user. The <codeph>impala</codeph> user must be 
able to read all directories and data
+      files that you query, and write into all the directories and data files 
for <codeph>INSERT</codeph> and
+      <codeph>LOAD DATA</codeph> statements. At a minimum, make sure the 
<codeph>impala</codeph> user is in the
+      <codeph>hive</codeph> group so that it can access files and directories 
shared between Impala and Hive. See
+      <xref href="impala_prereqs.xml#prereqs_account"/> for more details.
+    </p>
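+
+    <p>
+      On a system where this group membership is not already set up, adding
+      the <codeph>impala</codeph> user to the <codeph>hive</codeph> group and
+      verifying the result might look like this:
+    </p>
+
+<codeblock>$ sudo usermod -a -G hive impala
+$ groups impala</codeblock>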
+
+    <p>
+      Setting file permissions is necessary for Impala to function correctly, 
but is not an effective security
+      practice by itself:
+    </p>
+
+    <ul>
+      <li>
+      <p>
+        The way to ensure that only authorized users can submit requests for 
databases and tables they are allowed
+        to access is to set up Sentry authorization, as explained in
+        <xref href="impala_authorization.xml#authorization"/>. With 
authorization enabled, the checking of the user
+        ID and group is done by Impala, and unauthorized access is blocked by 
Impala itself. The actual low-level
+        read and write requests are still done by the <codeph>impala</codeph> 
user, so you must have appropriate
+        file and directory permissions for that user ID.
+      </p>
+      </li>
+
+      <li>
+      <p>
+        You must also set up Kerberos authentication, as described in <xref 
href="impala_kerberos.xml#kerberos"/>,
+        so that users can only connect from trusted hosts. With Kerberos 
enabled, if someone connects a new host to
+        the network and creates user IDs that match your privileged IDs, they 
will be blocked from connecting to
+        Impala at all from that host.
+      </p>
+      </li>
+    </ul>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_security_guidelines.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_security_guidelines.xml 
b/docs/topics/impala_security_guidelines.xml
new file mode 100644
index 0000000..e7713ff
--- /dev/null
+++ b/docs/topics/impala_security_guidelines.xml
@@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="security_guidelines">
+
+  <title>Security Guidelines for Impala</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Planning"/>
+      <data name="Category" value="Guidelines"/>
+      <data name="Category" value="Best Practices"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      The following are the major steps to harden a cluster running Impala 
against accidents and mistakes, or
+      malicious attackers trying to access sensitive data:
+    </p>
+
+    <ul>
+      <li>
+      <p>
+        Secure the <codeph>root</codeph> account. The <codeph>root</codeph> 
user can tamper with the
+        <cmdname>impalad</cmdname> daemon, read and write the data files in 
HDFS, log into other user accounts, and
+        access other system services that are beyond the control of Impala.
+      </p>
+      </li>
+
+      <li>
+      <p>
+        Restrict membership in the <codeph>sudoers</codeph> list (in the 
<filepath>/etc/sudoers</filepath> file).
+        The users who can run the <codeph>sudo</codeph> command can do many of 
the same things as the
+        <codeph>root</codeph> user.
+      </p>
+      </li>
+
+      <li>
+      <p>
+        Ensure the Hadoop ownership and permissions for Impala data files are 
restricted.
+      </p>
+      </li>
+
+      <li>
+      <p>
+        Ensure the ownership and permissions for Impala log files are
+        restricted.
+      </p>
+      </li>
+
+      <li>
+      <p>
+        Ensure that the Impala web UI (available by default on port 25000 on 
each Impala node) is
+        password-protected. See <xref href="impala_webui.xml#webui"/> for 
details.
+      </p>
+      </li>
+
+      <li>
+      <p>
+        Create a policy file that specifies which Impala privileges are
+        available to users in particular Hadoop groups (which by default map
+        to Linux OS groups). Create the associated Linux groups using the
+        <cmdname>groupadd</cmdname> command if necessary. (A brief sketch of
+        such a policy file appears after this list.)
+      </p>
+      </li>
+
+      <li>
+      <p>
+        The Impala authorization feature makes use of the HDFS file ownership 
and permissions mechanism; for
+        background information, see the
+        <xref href="https://archive.cloudera.com/cdh/3/hadoop/hdfs_permissions_guide.html"
+        scope="external" format="html">CDH
+        HDFS Permissions Guide</xref>. Set up users and assign them to groups 
at the OS level, corresponding to the
+        different categories of users with different access levels for various 
databases, tables, and HDFS
+        locations (URIs). Create the associated Linux users using the 
<cmdname>useradd</cmdname> command if
+        necessary, and add them to the appropriate groups with the 
<cmdname>usermod</cmdname> command.
+      </p>
+      </li>
+
+      <li>
+      <p>
+        Design your databases, tables, and views so that authorization policy
+        rules can be simple and consistent. For example, if all tables
+        related to an application are inside a single
+        database, you can assign privileges for that database and use the 
<codeph>*</codeph> wildcard for the table
+        name. If you are creating views with different privileges than the 
underlying base tables, you might put
+        the views in a separate database so that you can use the 
<codeph>*</codeph> wildcard for the database
+        containing the base tables, while specifying the precise names of the 
individual views. (For specifying
+        table or database names, you either specify the exact name or 
<codeph>*</codeph> to mean all the databases
+        on a server, or all the tables and views in a database.)
+      </p>
+      </li>
+
+      <li>
+      <p>
+        Enable authorization by running the <codeph>impalad</codeph> daemons 
with the <codeph>-server_name</codeph>
+        and <codeph>-authorization_policy_file</codeph> options on all nodes. 
(The authorization feature does not
+        apply to the <cmdname>statestored</cmdname> daemon, which has no 
access to schema objects or data files.)
+      </p>
+      </li>
+
+      <li>
+      <p>
+        Set up authentication using Kerberos, to make sure users really are 
who they say they are.
+      </p>
+      </li>
+    </ul>
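+
+    <p>
+      The following is a brief sketch of such a policy file. It maps Hadoop
+      groups to roles and roles to privileges; the group, role, server, and
+      database names here are hypothetical:
+    </p>
+
+<codeblock>[groups]
+analysts = analyst_role
+
+[roles]
+analyst_role = server=server1-&gt;db=sales_db-&gt;table=*-&gt;action=select</codeblock>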
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_security_install.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_security_install.xml 
b/docs/topics/impala_security_install.xml
new file mode 100644
index 0000000..56d34bc
--- /dev/null
+++ b/docs/topics/impala_security_install.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="security_install">
+
+  <title>Installation Considerations for Impala Security</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      Impala 1.1 comes set up with all the software and settings needed to 
enable security when you run the
+      <cmdname>impalad</cmdname> daemon with the new security-related options 
(<codeph>-server_name</codeph> and
+      <codeph>-authorization_policy_file</codeph>). You do not need to change 
any environment variables or install
+      any additional JAR files. In a cluster managed by Cloudera Manager, you 
do not need to change any settings in
+      Cloudera Manager.
+    </p>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_security_metastore.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_security_metastore.xml 
b/docs/topics/impala_security_metastore.xml
new file mode 100644
index 0000000..246333f
--- /dev/null
+++ b/docs/topics/impala_security_metastore.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="security_metastore">
+
+  <title>Securing the Hive Metastore Database</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Hive"/>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Metastore"/>
+      <data name="Category" value="Databases"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+<!-- Some of this copied from earlier. Split out both instances into conrefs. 
-->
+
+    <p>
+      It is important to secure the Hive metastore, so that users cannot
+      access the names or other information about databases and tables
+      through the Hive client or by querying the metastore database. Do this
+      by
+      turning on Hive metastore security, using the instructions in the
+      <xref href="http://www.cloudera.com/documentation/enterprise/latest/topics/cdh_sg_hive_security.html"
+      scope="external" format="html">CDH 5 Security Guide</xref>
+      for securing different Hive components:
+    </p>
+
+    <ul>
+      <li>
+        Secure the Hive Metastore.
+      </li>
+
+      <li>
+        In addition, allow access to the metastore only from the HiveServer2 
server, and then disable local access
+        to the HiveServer2 server.
+      </li>
+    </ul>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_security_webui.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_security_webui.xml 
b/docs/topics/impala_security_webui.xml
new file mode 100644
index 0000000..7ebd2ef
--- /dev/null
+++ b/docs/topics/impala_security_webui.xml
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="1.1" id="security_webui">
+
+  <title>Securing the Impala Web User Interface</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Troubleshooting"/>
+      <data name="Category" value="Security"/>
+      <data name="Category" value="Administrators"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      The instructions in this section presume you are familiar with the
+      <xref href="http://en.wikipedia.org/wiki/.htpasswd"; scope="external" 
format="html">
+      <filepath>.htpasswd</filepath> mechanism</xref> commonly used to 
password-protect pages on web servers.
+    </p>
+
+    <p>
+      Password-protect the Impala web UI that listens on port 25000 by 
default. Set up a
+      <filepath>.htpasswd</filepath> file in the <codeph>$IMPALA_HOME</codeph> 
directory, or start both the
+      <cmdname>impalad</cmdname> and <cmdname>statestored</cmdname> daemons 
with the
+      <codeph>--webserver_password_file</codeph> option to specify a different 
location (including the filename).
+    </p>
+
+    <p>
+      This file should only be readable by the Impala process and machine 
administrators, because it contains
+      (hashed) versions of passwords. The username/password pairs are not
+      derived from Unix usernames, Kerberos
+      users, or any other system. The <codeph>domain</codeph> field in the 
password file must match the domain
+      supplied to Impala by the new command-line option 
<codeph>--webserver_authentication_domain</codeph>. The
+      default is <codeph>mydomain.com</codeph>.
+<!-- Password generator cited by Henry: <xref 
href="http://www.askapache.com/online-tools/htpasswd-generator/"; 
scope="external" format="html"/> -->
+    </p>
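+
+    <p>
+      As one way to set this up (a sketch only; the file path is
+      hypothetical), the Apache <cmdname>htdigest</cmdname> utility produces
+      password entries in the digest format that includes the domain field:
+    </p>
+
+<codeblock>$ htdigest -c /etc/impala/.htpasswd mydomain.com admin
+$ impalad --webserver_password_file=/etc/impala/.htpasswd \
+    --webserver_authentication_domain=mydomain.com ...</codeblock>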
+
+    <p>
+      Impala also supports using HTTPS for secure web traffic. To do so, set
+      <codeph>--webserver_certificate_file</codeph> to refer to a valid 
<codeph>.pem</codeph> TLS/SSL certificate file.
+      Impala will automatically start using HTTPS once the TLS/SSL certificate 
has been read and validated. A
+      <codeph>.pem</codeph> file is basically a private key, followed by a 
signed TLS/SSL certificate; make sure to
+      concatenate both parts when constructing the <codeph>.pem</codeph> file.
+<!-- Certificate info cited by Henry: <xref 
href="http://www.akadia.com/services/ssh_test_certificate.html"; 
scope="external" format="html"/>
+This page was very useful for creating a certificate and private key file;
+the last step which was missing was to append one file to the other to make 
the <codeph>.pem</codeph> file. -->
+    </p>
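+
+    <p>
+      For example, concatenating the key and certificate into the
+      <codeph>.pem</codeph> file and starting Impala with it might look like
+      the following (the file names and path are hypothetical):
+    </p>
+
+<codeblock>$ cat server.key server.crt &gt; /etc/impala/impala_webui.pem
+$ impalad --webserver_certificate_file=/etc/impala/impala_webui.pem ...</codeblock>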
+
+    <p>
+      If Impala cannot find or parse the <codeph>.pem</codeph> file, it prints 
an error message and quits.
+    </p>
+
+    <note>
+      <p>
+        If the private key is encrypted using a passphrase, Impala will ask 
for that passphrase on startup, which
+        is not useful for a large cluster. In that case, remove the passphrase 
and make the <codeph>.pem</codeph>
+        file readable only by Impala and administrators.
+      </p>
+      <p>
+        When you turn on TLS/SSL for the Impala web UI, the associated URLs 
change from <codeph>http://</codeph>
+        prefixes to <codeph>https://</codeph>. Adjust any bookmarks or 
application code that refers to those URLs.
+      </p>
+    </note>
+  </conbody>
+</concept>

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/1fcc8cee/docs/topics/impala_seqfile.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_seqfile.xml b/docs/topics/impala_seqfile.xml
new file mode 100644
index 0000000..860007e
--- /dev/null
+++ b/docs/topics/impala_seqfile.xml
@@ -0,0 +1,239 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="seqfile">
+
+  <title id="sequencefile">Using the SequenceFile File Format with Impala 
Tables</title>
+  <titlealts audience="PDF"><navtitle>SequenceFile Data 
Files</navtitle></titlealts>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <!-- <data name="Category" value="SequenceFile"/> -->
+      <data name="Category" value="File Formats"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      <indexterm audience="Cloudera">SequenceFile support in Impala</indexterm>
+      Impala supports using SequenceFile data files.
+    </p>
+
+    <table>
+      <title>SequenceFile Format Support in Impala</title>
+      <tgroup cols="5">
+        <colspec colname="1" colwidth="10*"/>
+        <colspec colname="2" colwidth="10*"/>
+        <colspec colname="3" colwidth="20*"/>
+        <colspec colname="4" colwidth="30*"/>
+        <colspec colname="5" colwidth="30*"/>
+        <thead>
+          <row>
+            <entry>
+              File Type
+            </entry>
+            <entry>
+              Format
+            </entry>
+            <entry>
+              Compression Codecs
+            </entry>
+            <entry>
+              Impala Can CREATE?
+            </entry>
+            <entry>
+              Impala Can INSERT?
+            </entry>
+          </row>
+        </thead>
+        <tbody>
+          <row 
conref="impala_file_formats.xml#file_formats/sequencefile_support">
+            <entry/>
+          </row>
+        </tbody>
+      </tgroup>
+    </table>
+
+    <p outputclass="toc inpage"/>
+  </conbody>
+
+  <concept id="seqfile_create">
+
+    <title>Creating SequenceFile Tables and Loading Data</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="ETL"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        If you do not have an existing data file to use, begin by creating one 
in the appropriate format.
+      </p>
+
+      <p>
+        <b>To create a SequenceFile table:</b>
+      </p>
+
+      <p>
+        In the <codeph>impala-shell</codeph> interpreter, issue a command 
similar to:
+      </p>
+
+<codeblock>create table sequencefile_table (<varname>column_specs</varname>) 
stored as sequencefile;</codeblock>
+
+      <p>
+        Because Impala can query some kinds of tables that it cannot currently 
write to, after creating tables of
+        certain file formats, you might use the Hive shell to load the data. 
See
+        <xref href="impala_file_formats.xml#file_formats"/> for details. After 
loading data into a table through
+        Hive or other mechanism outside of Impala, issue a <codeph>REFRESH 
<varname>table_name</varname></codeph>
+        statement the next time you connect to the Impala node, before 
querying the table, to make Impala recognize
+        the new data.
+      </p>
+
+      <p>
+        For example, here is how you might create some SequenceFile tables in 
Impala (by specifying the columns
+        explicitly, or cloning the structure of another table), load data 
through Hive, and query them through
+        Impala:
+      </p>
+
+<codeblock>$ impala-shell -i localhost
+[localhost:21000] &gt; create table seqfile_table (x int) stored as 
sequencefile;
+[localhost:21000] &gt; create table seqfile_clone like some_other_table stored 
as sequencefile;
+[localhost:21000] &gt; quit;
+
+$ hive
+hive&gt; insert into table seqfile_table select x from some_other_table;
+3 Rows loaded to seqfile_table
+Time taken: 19.047 seconds
+hive&gt; quit;
+
+$ impala-shell -i localhost
+[localhost:21000] &gt; select * from seqfile_table;
+Returned 0 row(s) in 0.23s
+[localhost:21000] &gt; -- Make Impala recognize the data loaded through Hive;
+[localhost:21000] &gt; refresh seqfile_table;
+[localhost:21000] &gt; select * from seqfile_table;
++---+
+| x |
++---+
+| 1 |
+| 2 |
+| 3 |
++---+
+Returned 3 row(s) in 0.23s</codeblock>
+
+      <p 
conref="../shared/impala_common.xml#common/complex_types_unsupported_filetype"/>
+
+    </conbody>
+  </concept>
+
+  <concept id="seqfile_compression">
+
+    <title>Enabling Compression for SequenceFile Tables</title>
+  <prolog>
+    <metadata>
+      <data name="Category" value="Compression"/>
+      <data name="Category" value="Snappy"/>
+    </metadata>
+  </prolog>
+
+    <conbody>
+
+      <p>
+        <indexterm audience="Cloudera">compression</indexterm>
+        You may want to enable compression on existing tables. Enabling 
compression provides performance gains in
+        most cases and is supported for SequenceFile tables. For example, to 
enable Snappy compression, you would
+        specify the following additional settings when loading data through 
the Hive shell:
+      </p>
+
+<codeblock>hive&gt; SET hive.exec.compress.output=true;
+hive&gt; SET mapred.max.split.size=256000000;
+hive&gt; SET mapred.output.compression.type=BLOCK;
+hive&gt; SET 
mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
+hive&gt; insert overwrite table <varname>new_table</varname> select * from 
<varname>old_table</varname>;</codeblock>
+
+      <p>
+        If you are converting partitioned tables, you must complete additional 
steps. In such a case, specify
+        additional settings similar to the following:
+      </p>
+
+<codeblock>hive&gt; create table <varname>new_table</varname> 
(<varname>your_cols</varname>) partitioned by 
(<varname>partition_cols</varname>) stored as <varname>new_format</varname>;
+hive&gt; SET hive.exec.dynamic.partition.mode=nonstrict;
+hive&gt; SET hive.exec.dynamic.partition=true;
+hive&gt; insert overwrite table <varname>new_table</varname> 
partition(<varname>comma_separated_partition_cols</varname>) select * from 
<varname>old_table</varname>;</codeblock>
+
+      <p>
+        Remember that Hive does not require you to specify a source format;
+        it reads the source table in whatever format that table already uses.
+        Consider the case of converting a table to a Snappy-compressed
+        SequenceFile. Combining the components outlined previously to
+        complete this table conversion, you would specify settings similar to
+        the following:
+      </p>
+
+<codeblock>hive&gt; create table TBL_SEQ (int_col int, string_col string) 
STORED AS SEQUENCEFILE;
+hive&gt; SET hive.exec.compress.output=true;
+hive&gt; SET mapred.max.split.size=256000000;
+hive&gt; SET mapred.output.compression.type=BLOCK;
+hive&gt; SET 
mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
+hive&gt; SET hive.exec.dynamic.partition.mode=nonstrict;
+hive&gt; SET hive.exec.dynamic.partition=true;
+hive&gt; INSERT OVERWRITE TABLE tbl_seq SELECT * FROM tbl;</codeblock>
+
+      <p>
+        To complete a similar process for a table that includes partitions, 
you would specify settings similar to
+        the following:
+      </p>
+
+<codeblock>hive&gt; CREATE TABLE tbl_seq (int_col INT, string_col STRING) 
PARTITIONED BY (year INT) STORED AS SEQUENCEFILE;
+hive&gt; SET hive.exec.compress.output=true;
+hive&gt; SET mapred.max.split.size=256000000;
+hive&gt; SET mapred.output.compression.type=BLOCK;
+hive&gt; SET 
mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
+hive&gt; SET hive.exec.dynamic.partition.mode=nonstrict;
+hive&gt; SET hive.exec.dynamic.partition=true;
+hive&gt; INSERT OVERWRITE TABLE tbl_seq PARTITION(year) SELECT * FROM 
tbl;</codeblock>
+
+      <note>
+        <p>
+          The compression type is specified in the following command:
+        </p>
+<codeblock>SET 
mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;</codeblock>
+        <p>
+          You could elect to specify alternative codecs such as 
<codeph>GzipCodec</codeph> here.
+        </p>
+      </note>
+    </conbody>
+  </concept>
+
+  <concept audience="Cloudera" id="seqfile_data_types">
+
+    <title>Data Type Considerations for SequenceFile Tables</title>
+
+    <conbody>
+
+      <p></p>
+    </conbody>
+  </concept>
+
+  <concept id="seqfile_performance">
+
+    <title>Query Performance for Impala SequenceFile Tables</title>
+
+    <conbody>
+
+      <p>
+        In general, expect query performance with SequenceFile tables to be
+        faster than with tables using text data, but slower than with
+        Parquet tables. See <xref href="impala_parquet.xml#parquet"/>
+        for information about using the Parquet file format for
+        high-performance analytic queries.
+      </p>
+
+      <p conref="../shared/impala_common.xml#common/s3_block_splitting"/>
+
+    </conbody>
+  </concept>
+
+</concept>
